+++ /dev/null
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
+++ /dev/null
-# cuda-convnet2
-Automatically exported from code.google.com/p/cuda-convnet2
-
-You can read the documentation in two ways:
-
-1. On this site: go to branches > wiki.
-2. On Google Code (for now?): https://code.google.com/p/cuda-convnet2/
+++ /dev/null
-#!/bin/sh
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-###############################################################################
-
-# Fill in the below environment variables.
-#
-# If you're not sure what these paths should be,
-# you can use the find command to try to locate them.
-# For example, NUMPY_INCLUDE_PATH contains the file
-# arrayobject.h. So you can search for it like this:
-#
-# find /usr -name arrayobject.h
-#
-# (it'll almost certainly be under /usr)
-
-# CUDA toolkit installation directory.
-export CUDA_INSTALL_PATH=/usr/local/cuda
-
-# Python include directory. This should contain the file Python.h, among others.
-export PYTHON_INCLUDE_PATH=/usr/include/python2.7
-
-# Numpy include directory. This should contain the file arrayobject.h, among others.
-export NUMPY_INCLUDE_PATH=/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/
-
-# ATLAS library directory. This should contain the file libcblas.so, among others.
-export ATLAS_LIB_PATH=/usr/lib/atlas-base
-
-# You don't have to change these:
-export LD_LIBRARY_PATH=$CUDA_INSTALL_PATH/lib64:$LD_LIBRARY_PATH
-export CUDA_SDK_PATH=$CUDA_INSTALL_PATH/samples
-export PATH=$PATH:$CUDA_INSTALL_PATH/bin
-
-cd util && make numpy=1 -j $* && cd ..
-cd nvmatrix && make -j $* && cd ..
-cd cudaconv3 && make -j $* && cd ..
-cd cudaconvnet && make -j $* && cd ..
-cd make-data/pyext && make -j $* && cd ../..
-
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from python_util.data import *
-import numpy.random as nr
-import numpy as n
-import random as r
-from time import time
-from threading import Thread
-from math import sqrt
-import sys
-#from matplotlib import pylab as pl
-from PIL import Image
-from StringIO import StringIO
-from time import time
-import itertools as it
-
-class JPEGBatchLoaderThread(Thread):
- def __init__(self, dp, batch_num, label_offset, list_out):
- Thread.__init__(self)
- self.list_out = list_out
- self.label_offset = label_offset
- self.dp = dp
- self.batch_num = batch_num
-
- @staticmethod
- def load_jpeg_batch(rawdics, dp, label_offset):
- if type(rawdics) != list:
- rawdics = [rawdics]
- nc_total = sum(len(r['data']) for r in rawdics)
-
- jpeg_strs = list(it.chain.from_iterable(rd['data'] for rd in rawdics))
- labels = list(it.chain.from_iterable(rd['labels'] for rd in rawdics))
-
- img_mat = n.empty((nc_total * dp.data_mult, dp.inner_pixels * dp.num_colors), dtype=n.float32)
- lab_mat = n.zeros((nc_total, dp.get_num_classes()), dtype=n.float32)
- dp.convnet.libmodel.decodeJpeg(jpeg_strs, img_mat, dp.img_size, dp.inner_size, dp.test, dp.multiview)
- lab_vec = n.tile(n.asarray([(l[nr.randint(len(l))] if len(l) > 0 else -1) + label_offset for l in labels], dtype=n.single).reshape((nc_total, 1)), (dp.data_mult,1))
- for c in xrange(nc_total):
- lab_mat[c, [z + label_offset for z in labels[c]]] = 1
- lab_mat = n.tile(lab_mat, (dp.data_mult, 1))
-
-
- return {'data': img_mat[:nc_total * dp.data_mult,:],
- 'labvec': lab_vec[:nc_total * dp.data_mult,:],
- 'labmat': lab_mat[:nc_total * dp.data_mult,:]}
-
- def run(self):
- rawdics = self.dp.get_batch(self.batch_num)
- p = JPEGBatchLoaderThread.load_jpeg_batch(rawdics,
- self.dp,
- self.label_offset)
- self.list_out.append(p)
-
-class ColorNoiseMakerThread(Thread):
- def __init__(self, pca_stdevs, pca_vecs, num_noise, list_out):
- Thread.__init__(self)
- self.pca_stdevs, self.pca_vecs = pca_stdevs, pca_vecs
- self.num_noise = num_noise
- self.list_out = list_out
-
- def run(self):
- noise = n.dot(nr.randn(self.num_noise, 3).astype(n.single) * self.pca_stdevs.T, self.pca_vecs.T)
- self.list_out.append(noise)
-
-class ImageDataProvider(LabeledDataProvider):
- def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
- LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
- self.data_mean = self.batch_meta['data_mean'].astype(n.single)
- self.color_eig = self.batch_meta['color_pca'][1].astype(n.single)
- self.color_stdevs = n.c_[self.batch_meta['color_pca'][0].astype(n.single)]
- self.color_noise_coeff = dp_params['color_noise']
- self.num_colors = 3
- self.img_size = int(sqrt(self.batch_meta['num_vis'] / self.num_colors))
- self.mini = dp_params['minibatch_size']
- self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.img_size
- self.inner_pixels = self.inner_size **2
- self.border_size = (self.img_size - self.inner_size) / 2
- self.multiview = dp_params['multiview_test'] and test
- self.num_views = 5*2
- self.data_mult = self.num_views if self.multiview else 1
- self.batch_size = self.batch_meta['batch_size']
- self.label_offset = 0 if 'label_offset' not in self.batch_meta else self.batch_meta['label_offset']
- self.scalar_mean = dp_params['scalar_mean']
- # Maintain pointers to previously-returned data matrices so they don't get garbage collected.
- self.data = [None, None] # These are pointers to previously-returned data matrices
-
- self.loader_thread, self.color_noise_thread = None, None
- self.convnet = dp_params['convnet']
-
- self.num_noise = self.batch_size
- self.batches_generated, self.loaders_started = 0, 0
- self.data_mean_crop = self.data_mean.reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((1,3*self.inner_size**2))
-
- if self.scalar_mean >= 0:
- self.data_mean_crop = self.scalar_mean
-
- def showimg(self, img):
- from matplotlib import pylab as pl
- pixels = img.shape[0] / 3
- size = int(sqrt(pixels))
- img = img.reshape((3,size,size)).swapaxes(0,2).swapaxes(0,1)
- pl.imshow(img, interpolation='nearest')
- pl.show()
-
- def get_data_dims(self, idx=0):
- if idx == 0:
- return self.inner_size**2 * 3
- if idx == 2:
- return self.get_num_classes()
- return 1
-
- def start_loader(self, batch_idx):
- self.load_data = []
- self.loader_thread = JPEGBatchLoaderThread(self,
- self.batch_range[batch_idx],
- self.label_offset,
- self.load_data)
- self.loader_thread.start()
-
- def start_color_noise_maker(self):
- color_noise_list = []
- self.color_noise_thread = ColorNoiseMakerThread(self.color_stdevs, self.color_eig, self.num_noise, color_noise_list)
- self.color_noise_thread.start()
- return color_noise_list
-
- def set_labels(self, datadic):
- pass
-
- def get_data_from_loader(self):
- if self.loader_thread is None:
- self.start_loader(self.batch_idx)
- self.loader_thread.join()
- self.data[self.d_idx] = self.load_data[0]
-
- self.start_loader(self.get_next_batch_idx())
- else:
- # Set the argument to join to 0 to re-enable batch reuse
- self.loader_thread.join()
- if not self.loader_thread.is_alive():
- self.data[self.d_idx] = self.load_data[0]
- self.start_loader(self.get_next_batch_idx())
- #else:
- # print "Re-using batch"
- self.advance_batch()
-
- def add_color_noise(self):
- # At this point the data already has 0 mean.
- # So I'm going to add noise to it, but I'm also going to scale down
- # the original data. This is so that the overall scale of the training
- # data doesn't become too different from the test data.
-
- s = self.data[self.d_idx]['data'].shape
- cropped_size = self.get_data_dims(0) / 3
- ncases = s[0]
-
- if self.color_noise_thread is None:
- self.color_noise_list = self.start_color_noise_maker()
- self.color_noise_thread.join()
- self.color_noise = self.color_noise_list[0]
- self.color_noise_list = self.start_color_noise_maker()
- else:
- self.color_noise_thread.join(0)
- if not self.color_noise_thread.is_alive():
- self.color_noise = self.color_noise_list[0]
- self.color_noise_list = self.start_color_noise_maker()
-
- self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases*3, cropped_size))
- self.color_noise = self.color_noise[:ncases,:].reshape((3*ncases, 1))
- self.data[self.d_idx]['data'] += self.color_noise * self.color_noise_coeff
- self.data[self.d_idx]['data'] = self.data[self.d_idx]['data'].reshape((ncases, 3* cropped_size))
- self.data[self.d_idx]['data'] *= 1.0 / (1.0 + self.color_noise_coeff) # <--- NOTE: This is the slow line, 0.25sec. Down from 0.75sec when I used division.
-
- def get_next_batch(self):
- self.d_idx = self.batches_generated % 2
- epoch, batchnum = self.curr_epoch, self.curr_batchnum
-
- self.get_data_from_loader()
-
- # Subtract mean
- self.data[self.d_idx]['data'] -= self.data_mean_crop
-
- if self.color_noise_coeff > 0 and not self.test:
- self.add_color_noise()
- self.batches_generated += 1
-
- return epoch, batchnum, [self.data[self.d_idx]['data'].T, self.data[self.d_idx]['labvec'].T, self.data[self.d_idx]['labmat'].T]
-
-
- # Takes as input an array returned by get_next_batch
- # Returns a (numCases, imgSize, imgSize, 3) array which can be
- # fed to pylab for plotting.
- # This is used by shownet.py to plot test case predictions.
- def get_plottable_data(self, data, add_mean=True):
- mean = self.data_mean_crop.reshape((data.shape[0],1)) if data.flags.f_contiguous or self.scalar_mean else self.data_mean_crop.reshape((data.shape[0],1))
- return n.require((data + (mean if add_mean else 0)).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)
-
-class CIFARDataProvider(LabeledDataProvider):
- def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
- LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
- self.img_size = 32
- self.num_colors = 3
- self.inner_size = dp_params['inner_size'] if dp_params['inner_size'] > 0 else self.batch_meta['img_size']
- self.border_size = (self.img_size - self.inner_size) / 2
- self.multiview = dp_params['multiview_test'] and test
- self.num_views = 9
- self.scalar_mean = dp_params['scalar_mean']
- self.data_mult = self.num_views if self.multiview else 1
- self.data_dic = []
- for i in batch_range:
- self.data_dic += [unpickle(self.get_data_file_name(i))]
- self.data_dic[-1]["labels"] = n.require(self.data_dic[-1]['labels'], dtype=n.single)
- self.data_dic[-1]["labels"] = n.require(n.tile(self.data_dic[-1]["labels"].reshape((1, n.prod(self.data_dic[-1]["labels"].shape))), (1, self.data_mult)), requirements='C')
- self.data_dic[-1]['data'] = n.require(self.data_dic[-1]['data'] - self.scalar_mean, dtype=n.single, requirements='C')
-
- self.cropped_data = [n.zeros((self.get_data_dims(), self.data_dic[0]['data'].shape[1]*self.data_mult), dtype=n.single) for x in xrange(2)]
-
- self.batches_generated = 0
- self.data_mean = self.batch_meta['data_mean'].reshape((self.num_colors,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((self.get_data_dims(), 1))
-
- def get_next_batch(self):
- epoch, batchnum = self.curr_epoch, self.curr_batchnum
- self.advance_batch()
- bidx = batchnum - self.batch_range[0]
-
- cropped = self.cropped_data[self.batches_generated % 2]
-
- self.__trim_borders(self.data_dic[bidx]['data'], cropped)
- cropped -= self.data_mean
- self.batches_generated += 1
- return epoch, batchnum, [cropped, self.data_dic[bidx]['labels']]
-
- def get_data_dims(self, idx=0):
- return self.inner_size**2 * self.num_colors if idx == 0 else 1
-
- # Takes as input an array returned by get_next_batch
- # Returns a (numCases, imgSize, imgSize, 3) array which can be
- # fed to pylab for plotting.
- # This is used by shownet.py to plot test case predictions.
- def get_plottable_data(self, data):
- return n.require((data + self.data_mean).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)
-
- def __trim_borders(self, x, target):
- y = x.reshape(self.num_colors, self.img_size, self.img_size, x.shape[1])
-
- if self.test: # don't need to loop over cases
- if self.multiview:
- start_positions = [(0,0), (0, self.border_size), (0, self.border_size*2),
- (self.border_size, 0), (self.border_size, self.border_size), (self.border_size, self.border_size*2),
- (self.border_size*2, 0), (self.border_size*2, self.border_size), (self.border_size*2, self.border_size*2)]
- end_positions = [(sy+self.inner_size, sx+self.inner_size) for (sy,sx) in start_positions]
- for i in xrange(self.num_views):
- target[:,i * x.shape[1]:(i+1)* x.shape[1]] = y[:,start_positions[i][0]:end_positions[i][0],start_positions[i][1]:end_positions[i][1],:].reshape((self.get_data_dims(),x.shape[1]))
- else:
- pic = y[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size, :] # just take the center for now
- target[:,:] = pic.reshape((self.get_data_dims(), x.shape[1]))
- else:
- for c in xrange(x.shape[1]): # loop over cases
- startY, startX = nr.randint(0,self.border_size*2 + 1), nr.randint(0,self.border_size*2 + 1)
- endY, endX = startY + self.inner_size, startX + self.inner_size
- pic = y[:,startY:endY,startX:endX, c]
- if nr.randint(2) == 0: # also flip the image with 50% probability
- pic = pic[:,:,::-1]
- target[:,c] = pic.reshape((self.get_data_dims(),))
-
-class DummyConvNetLogRegDataProvider(LabeledDummyDataProvider):
- def __init__(self, data_dim):
- LabeledDummyDataProvider.__init__(self, data_dim)
-
- self.img_size = int(sqrt(data_dim/3))
-
- def get_next_batch(self):
- epoch, batchnum, dic = LabeledDummyDataProvider.get_next_batch(self)
- dic = {'data': dic[0], 'labels': dic[1]}
- print dic['data'].shape, dic['labels'].shape
- return epoch, batchnum, [dic['data'], dic['labels']]
-
- # Returns the dimensionality of the two data matrices returned by get_next_batch
- def get_data_dims(self, idx=0):
- return self.batch_meta['num_vis'] if idx == 0 else 1
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as n
-import numpy.random as nr
-import random as r
-from python_util.util import *
-from python_util.data import *
-from python_util.options import *
-from python_util.gpumodel import *
-import sys
-import math as m
-import layer as lay
-from convdata import ImageDataProvider, CIFARDataProvider, DummyConvNetLogRegDataProvider
-from os import linesep as NL
-import copy as cp
-import os
-
-class Driver(object):
- def __init__(self, convnet):
- self.convnet = convnet
-
- def on_start_batch(self, batch_data, train):
- pass
-
- def on_finish_batch(self):
- pass
-
-class GradCheckDriver(Driver):
- def on_start_batch(self, batch_data, train):
- data = batch_data[2]
- self.convnet.libmodel.checkGradients(data)
-
-class TrainingDriver(Driver):
- def on_start_batch(self, batch_data, train):
- data = batch_data[2]
- self.convnet.libmodel.startBatch(data, self.convnet.get_progress(), not train)
-
-class MultiviewTestDriver(TrainingDriver):
- def on_start_batch(self, batch_data, train):
- self.write_output = False
- if train:
- TrainingDriver.on_start_batch(self, batch_data, train)
- else:
- data = batch_data[2]
- num_views = self.convnet.test_data_provider.num_views
- if self.convnet.test_out != "" and self.convnet.logreg_name != "":
- self.write_output = True
- self.test_file_name = os.path.join(self.convnet.test_out, 'test_preds_%d' % batch_data[1])
- self.probs = n.zeros((data[0].shape[1]/num_views, self.convnet.test_data_provider.get_num_classes()), dtype=n.single)
- self.convnet.libmodel.startMultiviewTest(data, num_views, self.probs, self.convnet.logreg_name)
- else:
- self.convnet.libmodel.startMultiviewTest(data, num_views)
-
- def on_finish_batch(self):
- if self.write_output:
- if not os.path.exists(self.convnet.test_out):
- os.makedirs(self.convnet.test_out)
- pickle(self.test_file_name, {'data': self.probs,
- 'note': 'generated from %s' % self.convnet.save_file})
-
-class FeatureWriterDriver(Driver):
- def __init__(self, convnet):
- Driver.__init__(self, convnet)
- self.last_batch = convnet.test_batch_range[-1]
-
- def on_start_batch(self, batch_data, train):
- if train:
- raise ModelStateException("FeatureWriter must be used in conjunction with --test-only=1. It writes test data features.")
-
- self.batchnum, self.data = batch_data[1], batch_data[2]
-
- if not os.path.exists(self.convnet.feature_path):
- os.makedirs(self.convnet.feature_path)
-
- self.num_ftrs = self.convnet.layers[self.convnet.write_features]['outputs']
- self.ftrs = n.zeros((self.data[0].shape[1], self.num_ftrs), dtype=n.single)
- self.convnet.libmodel.startFeatureWriter(self.data, [self.ftrs], [self.convnet.write_features])
-
- def on_finish_batch(self):
- path_out = os.path.join(self.convnet.feature_path, 'data_batch_%d' % self.batchnum)
- pickle(path_out, {'data': self.ftrs, 'labels': self.data[1]})
- print "Wrote feature file %s" % path_out
- if self.batchnum == self.last_batch:
- pickle(os.path.join(self.convnet.feature_path, 'batches.meta'), {'source_model':self.convnet.load_file,
- 'num_vis':self.num_ftrs,
- 'batch_size': self.convnet.test_data_provider.batch_meta['batch_size']})
-
-class ConvNet(IGPUModel):
- def __init__(self, op, load_dic, dp_params={}):
- filename_options = []
- for v in ('color_noise', 'multiview_test', 'inner_size', 'scalar_mean', 'minibatch_size'):
- dp_params[v] = op.get_value(v)
-
- IGPUModel.__init__(self, "ConvNet", op, load_dic, filename_options, dp_params=dp_params)
-
- def import_model(self):
- lib_name = "cudaconvnet._ConvNet"
- print "========================="
- print "Importing %s C++ module" % lib_name
- self.libmodel = __import__(lib_name,fromlist=['_ConvNet'])
-
- def init_model_lib(self):
- self.libmodel.initModel(self.layers,
- self.device_ids,
- self.minibatch_size,
- self.conserve_mem)
-
- def init_model_state(self):
- ms = self.model_state
- layers = ms['layers'] if self.loaded_from_checkpoint else {}
- ms['layers'] = lay.LayerParser.parse_layers(os.path.join(self.layer_path, self.layer_def),
- os.path.join(self.layer_path, self.layer_params), self, layers=layers)
-
- self.do_decouple_conv()
- self.do_unshare_weights()
-
- self.op.set_value('conv_to_local', [], parse=False)
- self.op.set_value('unshare_weights', [], parse=False)
-
- self.set_driver()
-
- def do_decouple_conv(self):
- # Convert convolutional layers to local
- if len(self.op.get_value('conv_to_local')) > 0:
- for lname in self.op.get_value('conv_to_local'):
- if self.model_state['layers'][lname]['type'] == 'conv':
- lay.LocalLayerParser.conv_to_local(self.model_state['layers'], lname)
-
- def do_unshare_weights(self):
- # Decouple weight matrices
- if len(self.op.get_value('unshare_weights')) > 0:
- for name_str in self.op.get_value('unshare_weights'):
- if name_str:
- name = lay.WeightLayerParser.get_layer_name(name_str)
- if name is not None:
- name, idx = name[0], name[1]
- if name not in self.model_state['layers']:
- raise ModelStateException("Layer '%s' does not exist; unable to unshare" % name)
- layer = self.model_state['layers'][name]
- lay.WeightLayerParser.unshare_weights(layer, self.model_state['layers'], matrix_idx=idx)
- else:
- raise ModelStateException("Invalid layer name '%s'; unable to unshare." % name_str)
-
- def set_driver(self):
- if self.op.get_value('check_grads'):
- self.driver = GradCheckDriver(self)
- elif self.op.get_value('multiview_test'):
- self.driver = MultiviewTestDriver(self)
- elif self.op.get_value('write_features'):
- self.driver = FeatureWriterDriver(self)
- else:
- self.driver = TrainingDriver(self)
-
- def fill_excused_options(self):
- if self.op.get_value('check_grads'):
- self.op.set_value('save_path', '')
- self.op.set_value('train_batch_range', '0')
- self.op.set_value('test_batch_range', '0')
- self.op.set_value('data_path', '')
-
- # Make sure the data provider returned data in proper format
- def parse_batch_data(self, batch_data, train=True):
- if max(d.dtype != n.single for d in batch_data[2]):
- raise DataProviderException("All matrices returned by data provider must consist of single-precision floats.")
- return batch_data
-
- def start_batch(self, batch_data, train=True):
- self.driver.on_start_batch(batch_data, train)
-
- def finish_batch(self):
- ret = IGPUModel.finish_batch(self)
- self.driver.on_finish_batch()
- return ret
-
- def print_iteration(self):
- print "%d.%d (%.2f%%)..." % (self.epoch, self.batchnum, 100 * self.get_progress()),
-
- def print_train_time(self, compute_time_py):
- print "(%.3f sec)" % (compute_time_py)
-
- def print_costs(self, cost_outputs):
- costs, num_cases = cost_outputs[0], cost_outputs[1]
- children = set()
- for errname in costs:
- if sum(errname in self.layers[z]['children'] for z in costs) == 0:
-# print self.layers[errname]['children']
- for child in set(self.layers[errname]['children']) & set(costs.keys()):
- costs[errname] = [v + u for v, u in zip(costs[errname], costs[child])]
- children.add(child)
-
- filtered_costs = eval(self.layers[errname]['outputFilter'])(costs[errname], num_cases)
- print "%s: " % errname,
- if 'outputFilterFormatter' not in self.layers[errname]:
- print ", ".join("%.6f" % v for v in filtered_costs),
- else:
- print eval(self.layers[errname]['outputFilterFormatter'])(self,filtered_costs),
- if m.isnan(filtered_costs[0]) or m.isinf(filtered_costs[0]):
- print "<- error nan or inf!"
- sys.exit(1)
- for c in children:
- del costs[c]
-
- def print_train_results(self):
- self.print_costs(self.train_outputs[-1])
-
- def print_test_status(self):
- pass
-
- def print_test_results(self):
- print NL + "======================Test output======================"
- self.print_costs(self.test_outputs[-1])
- if not self.test_only:
- print NL + "----------------------Averages-------------------------"
- self.print_costs(self.aggregate_test_outputs(self.test_outputs[-len(self.test_batch_range):]))
- print NL + "-------------------------------------------------------",
- for name,val in sorted(self.layers.items(), key=lambda x: x[1]['id']): # This is kind of hacky but will do for now.
- l = self.layers[name]
- if 'weights' in l:
- wscales = [(l['name'], i, n.mean(n.abs(w)), n.mean(n.abs(wi))) for i,(w,wi) in enumerate(zip(l['weights'],l['weightsInc']))]
- print ""
- print NL.join("Layer '%s' weights[%d]: %e [%e] [%e]" % (s[0], s[1], s[2], s[3], s[3]/s[2] if s[2] > 0 else 0) for s in wscales),
- print "%sLayer '%s' biases: %e [%e]" % (NL, l['name'], n.mean(n.abs(l['biases'])), n.mean(n.abs(l['biasesInc']))),
- print ""
-
- def conditional_save(self):
- self.save_state()
-
- def aggregate_test_outputs(self, test_outputs):
- test_outputs = cp.deepcopy(test_outputs)
- num_cases = sum(t[1] for t in test_outputs)
- for i in xrange(1 ,len(test_outputs)):
- for k,v in test_outputs[i][0].items():
- for j in xrange(len(v)):
- test_outputs[0][0][k][j] += test_outputs[i][0][k][j]
-
- return (test_outputs[0][0], num_cases)
-
- @classmethod
- def get_options_parser(cls):
- op = IGPUModel.get_options_parser()
- op.add_option("mini", "minibatch_size", IntegerOptionParser, "Minibatch size", default=128)
- op.add_option("layer-def", "layer_def", StringOptionParser, "Layer definition file", set_once=False)
- op.add_option("layer-params", "layer_params", StringOptionParser, "Layer parameter file")
- op.add_option("layer-path", "layer_path", StringOptionParser, "Layer file path prefix", default="")
- op.add_option("check-grads", "check_grads", BooleanOptionParser, "Check gradients and quit?", default=0, excuses=['data_path','save_path', 'save_file_override', 'train_batch_range','test_batch_range'])
- op.add_option("multiview-test", "multiview_test", BooleanOptionParser, "Cropped DP: test on multiple patches?", default=0)
- op.add_option("inner-size", "inner_size", IntegerOptionParser, "Cropped DP: crop size (0 = don't crop)", default=0, set_once=True)
- op.add_option("conv-to-local", "conv_to_local", ListOptionParser(StringOptionParser), "Convert given conv layers to unshared local", default=[])
- op.add_option("unshare-weights", "unshare_weights", ListOptionParser(StringOptionParser), "Unshare weight matrices in given layers", default=[])
- op.add_option("conserve-mem", "conserve_mem", BooleanOptionParser, "Conserve GPU memory (slower)?", default=0)
- op.add_option("color-noise", "color_noise", FloatOptionParser, "Add PCA noise to color channels with given scale", default=0.0)
- op.add_option("test-out", "test_out", StringOptionParser, "Output test case predictions to given path", default="", requires=['logreg_name', 'multiview_test'])
- op.add_option("logreg-name", "logreg_name", StringOptionParser, "Logreg cost layer name (for --test-out)", default="")
- op.add_option("scalar-mean", "scalar_mean", FloatOptionParser, "Subtract this scalar from image (-1 = don't)", default=-1)
-
- op.add_option("write-features", "write_features", StringOptionParser, "Write test data features from given layer", default="", requires=['feature-path'])
- op.add_option("feature-path", "feature_path", StringOptionParser, "Write test data features to this path (to be used with --write-features)", default="")
-
- op.delete_option('max_test_err')
- op.options["testing_freq"].default = 57
- op.options["num_epochs"].default = 50000
- op.options['dp_type'].default = None
-
- DataProvider.register_data_provider('dummy-lr-n', 'Dummy ConvNet logistic regression', DummyConvNetLogRegDataProvider)
- DataProvider.register_data_provider('image', 'JPEG-encoded image data provider', ImageDataProvider)
- DataProvider.register_data_provider('cifar', 'CIFAR-10 data provider', CIFARDataProvider)
-
- return op
-
-if __name__ == "__main__":
-# nr.seed(6)
-
- op = ConvNet.get_options_parser()
-
- op, load_dic = IGPUModel.parse_options(op)
- model = ConvNet(op, load_dic)
- model.start()
+++ /dev/null
-################################################################################
-#
-# Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
-#
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users. This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
-#
-################################################################################
-
-# Location of the CUDA Toolkit binaries and libraries
-CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include
-CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin
-CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64
-
-# Common binaries
-NVCC = $(CUDA_BIN_PATH)/nvcc
-GCC = g++
-AR = ar
-
-# CUDA code generation flags
-GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
-GENCODE_FLAGS := $(GENCODE_SM35)
-
-LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
-CCFLAGS := -m64
-NVCCFLAGS := -m64
-
-# Debug build flags
-ifeq ($(dbg),1)
- CCFLAGS += -g
- NVCCFLAGS += -g -G
- DBG := debug
-else
- DBG := release
- NVCCFLAGS += -O3
- CCFLAGS += -O3
-endif
-
-# Add profiler output
-ifeq ($(prof),1)
- NVCCFLAGS += --ptxas-options=-v
-endif
-
-TARGETDIR := ./bin/$(DBG)
-OBJDIR := ./obj/$(DBG)
-
-########## USER STUFF ###########
-LDFLAGS += -L../util -lutilpy -L../nvmatrix -lnvmatrix -lcublas
-INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include
-
-CUFILES := $(shell find . -name "*.cu")
-CU_DEPS := $(shell find . -name "*.cuh")
-CCFILES := $(shell find . -name "*.cpp")
-C_DEPS := $(shell find . -name "*.h")
-
-NVCCFLAGS += --compiler-options '-fPIC'
-LDFLAGS += -shared
-CCFLAGS += -fPIC
-TARGET := $(TARGETDIR)/libcudaconv.so
-
-################################################################################
-# Set up target and object files
-################################################################################
-OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
-OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
-OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))
-
-# Target rules
-all: makedirs $(TARGET)
-
-$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
- $(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<
-
-$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
- $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $<
-
-$(TARGET): $(OBJS)
- $(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS)
- ln -sf $(TARGET) .
-
-makedirs:
- mkdir -p $(TARGETDIR)
- mkdir -p $(OBJDIR)/src
-
-clean:
- rm -rf ./obj
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef CONV_UTIL_CUH
-#define CONV_UTIL_CUH
-
-#include "../../nvmatrix/include/nvmatrix.cuh"
-
-#include "caffe2/core/context_gpu.h"
-
-#ifndef MIN
-#define MIN(a, b) ((a) > (b) ? (b) : (a))
-#endif
-#ifndef MAX
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#endif
-
-void convLocalMaxUndo(
- NVMatrix& images,
- NVMatrix& maxGrads,
- NVMatrix& maxActs,
- NVMatrix& target,
- int subsX,
- int startX,
- int strideX,
- int outputsX);
-void convLocalAvgUndo(
- NVMatrix& avgGrads,
- NVMatrix& target,
- int subsX,
- int startX,
- int strideX,
- int outputsX,
- int imgSize,
- bool sum);
-
-void convLocalAvgUndo(
- NVMatrix& avgGrads,
- NVMatrix& target,
- int subsX,
- int startX,
- int strideX,
- int outputsX,
- int imgSize,
- bool sum,
- float scaleTargets,
- float scaleOutput);
-void convLocalMaxUndo(
- NVMatrix& images,
- NVMatrix& maxGrads,
- NVMatrix& maxActs,
- NVMatrix& target,
- int subsX,
- int startX,
- int strideX,
- int outputsX,
- float scaleTargets,
- float scaleOutput);
-
-void convResponseNorm(
- NVMatrix& images,
- NVMatrix& denoms,
- NVMatrix& target,
- int numFilters,
- int sizeX,
- float addScale,
- float powScale,
- float minDiv);
-void convResponseNormUndo(
- NVMatrix& outGrads,
- NVMatrix& denoms,
- NVMatrix& inputs,
- NVMatrix& acts,
- NVMatrix& target,
- int numFilters,
- int sizeX,
- float addScale,
- float powScale,
- float scaleTargets,
- float scaleOutput);
-void convContrastNorm(
- NVMatrix& images,
- NVMatrix& meanDiffs,
- NVMatrix& denoms,
- NVMatrix& target,
- int numFilters,
- int sizeX,
- float addScale,
- float powScale,
- float minDiv);
-void convContrastNormUndo(
- NVMatrix& outGrads,
- NVMatrix& denoms,
- NVMatrix& meanDiffs,
- NVMatrix& acts,
- NVMatrix& target,
- int numFilters,
- int sizeX,
- float addScale,
- float powScale,
- float scaleTargets,
- float scaleOutput);
-
-void convGaussianBlur(
- NVMatrix& images,
- NVMatrix& filter,
- NVMatrix& target,
- bool horiz,
- int numChannels,
- float scaleTargets,
- float scaleOutputs);
-void convBedOfNails(
- NVMatrix& images,
- NVMatrix& target,
- int numChannels,
- int imgSize,
- int startX,
- int strideX,
- float scaleTargets,
- float scaleOutput);
-void convBedOfNailsUndo(
- NVMatrix& actsGrad,
- NVMatrix& target,
- int numChannels,
- int imgSize,
- int startX,
- int strideX,
- float scaleTargets,
- float scaleOutput);
-
-void convResizeBilinear(
- NVMatrix& images,
- NVMatrix& target,
- int imgSize,
- int tgtSize,
- float scale);
-void convRGBToYUV(NVMatrix& images, NVMatrix& target);
-void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center);
-void convCrop(
- NVMatrix& imgs,
- NVMatrix& target,
- int imgSize,
- int tgtSize,
- int startY,
- int startX);
-void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm);
-void convContrastNormCrossMap(
- NVMatrix& images,
- NVMatrix& meanDiffs,
- NVMatrix& target,
- int numFilters,
- int sizeF,
- float addScale,
- float powScale,
- float minDiv,
- bool blocked);
-void convResponseNormCrossMapUndo(
- NVMatrix& outGrads,
- NVMatrix& inputs,
- NVMatrix& acts,
- NVMatrix& target,
- int numFilters,
- int sizeF,
- float addScale,
- float powScale,
- float minDiv,
- bool blocked,
- float scaleTargets,
- float scaleOutput);
-void convResponseNormCrossMap(
- NVMatrix& images,
- NVMatrix& target,
- int numFilters,
- int sizeF,
- float addScale,
- float powScale,
- bool blocked);
-void convResponseNormCrossMap(
- NVMatrix& images,
- NVMatrix& target,
- int numFilters,
- int sizeF,
- float addScale,
- float powScale,
- float minDiv,
- bool blocked);
-void convReflectHorizontal(NVMatrix& images, NVMatrix& targets, int imgSize);
-
-void convCrossMapMaxPoolUndo(
- NVMatrix& images,
- NVMatrix& maxGrads,
- NVMatrix& maxActs,
- NVMatrix& target,
- const int imgSize,
- const int startF,
- const int poolSize,
- const int stride,
- const float scaleTargets,
- const float scaleOutputs);
-
-cudaTextureObject_t GetTensorTextureObject(caffe2::TensorCUDA* tensor);
-
-template <bool sum>
-class AvgPooler {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a + b;
- }
- __device__ inline float getBaseValue() const {
- return 0;
- }
- __device__ inline float output(const float a, const int regionSize) const {
- return sum ? a : (a / regionSize);
- }
-};
-
-class MaxPooler {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return fmaxf(a, b);
- }
- __device__ inline float getBaseValue() const {
- return -2e38;
- }
- __device__ inline float output(const float a, const int regionSize) const {
- return a;
- }
-};
-
-class MaxAbsPooler {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return fabsf(a) > fabsf(b) ? a : b;
- }
- __device__ inline float getBaseValue() const {
- return 0.0f;
- }
- __device__ inline float output(const float a, const int regionSize) const {
- return a;
- }
-};
-
-/*
- * Block size B_YxB_X
- * blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread
- * blockIdx.y determines output.y, filter idx in batches of B_Y*filtersPerThread
- *
- * So each block does one output for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines filter idx
- *
- * imgs: (numFilters, imgPixels, numImages)
- * target: (numFilters, numOutputs, numImages)
- *
- * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
- */
-
-template <
- class Agg,
- int B_Y,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- bool checkCaseBounds>
-__global__ void kLocalPool(
- float* imgs,
- float* target,
- const int imgSize,
- const int numFilters,
- const int numImages,
- const int subsX,
- const int startX,
- const int strideX,
- const int outputsX,
- Agg agg) {
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int numFilterBlocks = DIVUP(numFilters, B_Y * filtersPerThread);
- const int outputIdxX = blockIdx.x / numImgBlocks;
- const int outputIdxY = blockIdx.y / numFilterBlocks;
- const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
- const int blockFilterIdx =
- (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
- const int myFilterIdx = (blockFilterIdx + threadIdx.y * filtersPerThread);
- if (myFilterIdx >= numFilters) {
- return;
- }
-
- const int outputIdx = outputIdxY * outputsX + outputIdxX;
- const int numOutputs = outputsX * outputsX;
- const int imgPixels = imgSize * imgSize;
-
- const int startImgPxX = startX + outputIdxX * strideX;
- const int startImgPxY = startX + outputIdxY * strideX;
- const int imgIdx = blockImgIdx + threadIdx.x;
-
- imgs += myFilterIdx * imgPixels * numImages + imgIdx;
- target += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx;
-
- float prod[filtersPerThread][imgsPerThread];
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[f][i] = agg.getBaseValue();
- }
- }
-
- const int loopStartY = MAX(0, startImgPxY);
- const int loopStartX = MAX(0, startImgPxX);
- const int loopEndY = MIN(imgSize, startImgPxY + subsX);
- const int loopEndX = MIN(imgSize, startImgPxX + subsX);
- const int regionSize = (loopEndY - loopStartY) * (loopEndX - loopStartX);
- for (int y = loopStartY; y < loopEndY; y++) {
- for (int x = loopStartX; x < loopEndX; x++) {
- const int imgPx = y * imgSize + x;
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[f][i] =
- agg(prod[f][i],
- imgs[(f * imgPixels + imgPx) * numImages + i * B_X]);
- }
- }
- }
- }
- }
-
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- target[f * numOutputs * numImages + i * B_X] =
- agg.output(prod[f][i], regionSize);
- }
- }
- }
-}
-
-/*
- * Block size B_YxB_X
- * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
- * blockIdx.y determines pixel.y, output idx in batches of B_Y
- *
- * So each block does one pixel for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines output idx
- *
- * imgs: (numFilters, imgPixels, numImages)
- * target: (numOutputs, imgPixels, numImages) (out)
- *
- * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
- */
-template <class Agg, int B_Y, int B_X, int imgsPerThread, bool checkCaseBounds>
-__global__ void kPoolCrossMap(
- float* imgs,
- float* target,
- const int imgSize,
- const int numFilters,
- const int numImages,
- const int startF,
- const int poolSize,
- const int numOutputs,
- const int stride,
- Agg agg) {
- const int imgPixels = imgSize * imgSize;
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- // const int numOutputs = DIVUP(numFilters, stride);
- const int numOutputBlocks = DIVUP(numOutputs, B_Y);
- const int pxIdxX = blockIdx.x / numImgBlocks;
- const int pxIdxY = blockIdx.y / numOutputBlocks;
- const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
- const int outputIdx = (blockIdx.y % numOutputBlocks) * B_Y + threadIdx.y;
- // const int filterIdx = outputIdx * stride;
-
- const int pxIdx = pxIdxY * imgSize + pxIdxX;
- const int imgIdx = blockImgIdx + threadIdx.x;
-
- if (outputIdx < numOutputs) {
- imgs += (pxIdx)*numImages + imgIdx;
- target += (outputIdx * imgPixels + pxIdx) * numImages + imgIdx;
-
- float prod[imgsPerThread];
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
- prod[i] = agg.getBaseValue();
- }
- }
-
- const int myStartF = startF + outputIdx * stride;
- const int loopStartF = max(0, myStartF);
- const int loopEndF = min(numFilters, myStartF + poolSize);
-
- for (int f = loopStartF; f < loopEndF; ++f) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
- prod[i] = agg(prod[i], imgs[f * imgPixels * numImages + i * B_X]);
- }
- }
- }
-
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
- target[i * B_X] = agg.output(prod[i], poolSize);
- }
- }
- }
-}
-
-/*
- * imgs: (numFilters, imgPixels, numImages)
- * target: (numOutputs, imgPixels, numImages)
- */
-template <class Pooler>
-void convPoolCrossMap(
- NVMatrix& images,
- NVMatrix& target,
- const int startF,
- const int poolSize,
- const int numOutputs,
- const int stride,
- const int imgSize,
- Pooler pooler) {
- int numImages = images.getNumCols();
- int imgPixels = imgSize * imgSize;
- int numFilters = images.getNumRows() / imgPixels;
- assert(images.getNumRows() == numFilters * imgPixels);
-
- assert(!images.isTrans());
- assert(!target.isTrans());
- assert(images.isContiguous());
- // assert(numFilters % 4 == 0);
- // assert(numImages % 128 == 0);
- assert(stride <= poolSize);
- assert(startF <= 0);
- assert(
- startF + (numOutputs - 1) * stride + poolSize >=
- numFilters); // All filters must be covered
-
- cudaStream_t stream = NVMatrix::getDefaultStream();
- target.resize(imgPixels * numOutputs, numImages);
- int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
-
- dim3 threads(32, 4);
- dim3 blocks(
- imgSize * DIVUP(numImages, threads.x * imgsPerThread),
- imgSize * DIVUP(numOutputs, threads.y));
- bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0;
- if (!checkCaseBounds) {
- if (imgsPerThread == 4) {
- cudaFuncSetCacheConfig(
- kPoolCrossMap<Pooler, 4, 32, 4, false>, cudaFuncCachePreferShared);
- kPoolCrossMap<Pooler, 4, 32, 4, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- pooler);
-
- } else if (imgsPerThread == 2) {
- cudaFuncSetCacheConfig(
- kPoolCrossMap<Pooler, 4, 32, 2, false>, cudaFuncCachePreferShared);
- kPoolCrossMap<Pooler, 4, 32, 2, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- pooler);
-
- } else if (imgsPerThread == 1) {
- cudaFuncSetCacheConfig(
- kPoolCrossMap<Pooler, 4, 32, 1, false>, cudaFuncCachePreferShared);
- kPoolCrossMap<Pooler, 4, 32, 1, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- pooler);
- }
- } else {
- if (imgsPerThread == 1) {
- cudaFuncSetCacheConfig(
- kPoolCrossMap<Pooler, 4, 32, 1, true>, cudaFuncCachePreferShared);
- kPoolCrossMap<Pooler, 4, 32, 1, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- pooler);
- } else {
- assert(false);
- }
- }
- getLastCudaError("convPoolCrossMap: kernel execution failed");
-}
-
-/*
- * Block size 16xB_X
- * blockIdx.x determines 4x4 pixel.x region, image idx in batches of
- * B_X*imgsPerThread blockIdx.y determines 4x4 pixel.y region, filter idx in
- * batches of filtersPerThread
- *
- * So each block does a 4x4 region for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines pixel idx
- *
- * imgs: (numFilters, imgPixels, numImages)
- * target: (numFilters, numOutputs, numImages)
- *
- * B_X one of 8, 16, 32
- * imgsPerThread one of 1, 2, 4, 8, 16
- *
- * B_XximgsPerThread MUST be divisible by 32.
- * Number of filters MUST be divisible by filtersPerThread.
- *
- * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
- *
- * Final write-out will not be fully coalesced unless B_X is 32. But there's a
- * lot more reading than writing here, and the reading is all coalesced, so it
- * should be OK.
- *
- * To be used when the stride is 1 and the pooling region is fairly large.
- */
-template <
- class Agg,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- bool checkCaseBounds>
-__global__ void kLocalPool2(
- float* imgs,
- float* target,
- const int imgSize,
- const int numFilters,
- const int numImages,
- const int subsX,
- const int startX,
- const int outputsX,
- Agg agg) {
- __shared__ float shImgs[filtersPerThread][B_X * imgsPerThread];
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int numFilterBlocks = numFilters / (filtersPerThread);
- const int blockOutputX = 4 * (blockIdx.x / numImgBlocks);
- const int blockOutputY = 4 * (blockIdx.y / numFilterBlocks);
- const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
- const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread;
-
- // const int blockOutputIdx = blockOutputY * outputsX + blockOutputX;
- const int numOutputs = outputsX * outputsX;
- const int imgPixels = imgSize * imgSize;
-
- const int tidx = threadIdx.y * B_X + threadIdx.x;
- const int loadY = tidx / 32, loadX = tidx % 32;
-
- const int myX = threadIdx.y % 4;
- const int myY = threadIdx.y / 4;
-
- const int myOutputIdxY = blockOutputY + myY;
- const int myOutputIdxX = blockOutputX + myX;
- const int myOutputIdx = myOutputIdxY * outputsX + myOutputIdxX;
-
- const int startImgPxX = startX + blockOutputX;
- const int startImgPxY = startX + blockOutputY;
- const int endImgPxX = startImgPxX + subsX;
- const int endImgPxY = startImgPxY + subsX;
-
- const int myStartImgPxY = startImgPxY + myY;
- const int myStartImgPxX = startImgPxX + myX;
- const int myEndImgPxY = endImgPxY + myY;
- const int myEndImgPxX = endImgPxX + myX;
-
- const int loopStartY = MAX(startImgPxY, 0);
- const int loopStartX = MAX(startImgPxX, 0);
- const int loopEndY = MIN(imgSize, endImgPxY + 3);
- const int loopEndX = MIN(imgSize, endImgPxX + 3);
-
- const int imgIdx = blockImgIdx + threadIdx.x;
-
- imgs +=
- (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
- target += (blockFilterIdx * numOutputs + myOutputIdx) * numImages + imgIdx;
-
- float prod[filtersPerThread][imgsPerThread];
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[f][i] = agg.getBaseValue();
- }
- }
- int regionSize = 0;
- for (int y = loopStartY; y < loopEndY; y++) {
- const bool isInY = y >= myStartImgPxY && y < myEndImgPxY;
- for (int x = loopStartX; x < loopEndX; x++) {
- // Load a pixel
- const int px = y * imgSize + x;
-#pragma unroll
- for (int ly = 0; ly < filtersPerThread; ly += B_X / 2) {
- if (filtersPerThread % (B_X / 2) == 0 ||
- ly + loadY < filtersPerThread) {
-#pragma unroll
- for (int lx = 0; lx < B_X * imgsPerThread; lx += 32) {
- if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) {
- shImgs[ly + loadY][lx + loadX] =
- imgs[(ly * imgPixels + px) * numImages + lx];
- }
- }
- }
- }
- __syncthreads();
-
- // Is this pixel in my region?
- if (isInY && x >= myStartImgPxX && x < myEndImgPxX) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[f][i] = agg(prod[f][i], shImgs[f][threadIdx.x + i * B_X]);
- }
- }
- }
- ++regionSize;
- }
- __syncthreads();
- }
- }
- if (myOutputIdxY < outputsX && myOutputIdxX < outputsX) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- target[f * numOutputs * numImages + i * B_X] =
- agg.output(prod[f][i], regionSize);
- }
- }
- }
- }
-}
-
-/*
- * imgs: (numFilters, imgPixels, numImages)
- * target: (numFilters, outputs, numImages)
- */
-template <class Pooler>
-void convLocalPool(
- NVMatrix& images,
- NVMatrix& target,
- int numFilters,
- int subsX,
- int startX,
- int strideX,
- int outputsX,
- Pooler pooler) {
- int numImages = images.getNumCols();
- int imgPixels = images.getNumRows() / numFilters;
- assert(images.getNumRows() == numFilters * imgPixels);
- int imgSize = int(sqrt(imgPixels));
- assert(imgSize * imgSize == imgPixels);
-
- assert(!images.isTrans());
- assert(!target.isTrans());
- assert(images.isContiguous());
- // assert(numFilters % 4 == 0);
- // assert(numImages % 128 == 0);
- cudaStream_t stream = NVMatrix::getDefaultStream();
- int outputs = outputsX * outputsX;
- target.resize(numFilters * outputs, numImages);
-
- if (strideX == 1 && subsX >= 6 && outputsX > 1) {
- // NOTE: this part has not been optimized for Kepler
- int imgsPerThread = numImages % 128 == 0 ? 8 : 4;
- int filtersPerThread = numFilters % 4 == 0
- ? 4
- : numFilters % 3 == 0 ? 3 : numFilters % 2 == 0 ? 2 : 1;
- int bx = 8;
- bool checkCaseBounds = numImages % (bx * imgsPerThread) != 0;
- assert((imgsPerThread * bx) % 32 == 0);
- assert(numFilters % filtersPerThread == 0);
- dim3 threads(bx, 16);
- dim3 blocks(
- DIVUP(outputsX, 4) * DIVUP(numImages, bx * imgsPerThread),
- DIVUP(outputsX, 4) * numFilters / filtersPerThread);
- // printf("threads: %dx%d, blocks: %dx%d, imgSize: %d, numFilters:
- // %d, numImages: %d, subsX: %d, startX: %d, outputsX: %d\n",
- // threads.y, threads.x, blocks.y, blocks.x, imgSize,
- // numFilters, numImages, subsX, startX, outputsX);
- if (imgsPerThread == 8) {
- if (filtersPerThread == 1) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 8, 1, true>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 8, 1, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 8, 1, false>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 8, 1, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- }
- } else if (filtersPerThread == 2) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 8, 2, true>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 8, 2, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 8, 2, false>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 8, 2, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- }
- } else if (filtersPerThread == 3) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 8, 3, true>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 8, 3, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 8, 3, false>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 8, 3, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- }
- } else if (filtersPerThread == 4) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 8, 4, true>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 8, 4, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 8, 4, false>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 8, 4, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- }
- }
- } else if (imgsPerThread == 4) {
- if (filtersPerThread == 1) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 4, 1, true>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 4, 1, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 4, 1, false>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 4, 1, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- }
- } else if (filtersPerThread == 2) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 4, 2, true>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 4, 2, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 4, 2, false>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 4, 2, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- }
- } else if (filtersPerThread == 3) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 4, 3, true>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 4, 3, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 4, 3, false>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 4, 3, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- }
- } else if (filtersPerThread == 4) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 4, 4, true>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 4, 4, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool2<Pooler, 8, 4, 4, false>, cudaFuncCachePreferShared);
- kLocalPool2<Pooler, 8, 4, 4, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- outputsX,
- pooler);
- }
- }
- }
- } else {
- int filtersPerThread = numFilters % 16 == 0 ? 4 : 1;
- int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
- bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0;
- dim3 threads(32, 4);
- dim3 blocks(
- DIVUP(numImages, 32 * imgsPerThread) * outputsX,
- DIVUP(numFilters, 4 * filtersPerThread) * outputsX);
- if (imgsPerThread == 4) {
- if (filtersPerThread == 1) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 4, 1, true>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 4, 1, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 4, 1, false>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 4, 1, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 4, 4, true>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 4, 4, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 4, 4, false>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 4, 4, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- }
- }
- } else if (imgsPerThread == 2) {
- if (filtersPerThread == 1) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 2, 1, true>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 2, 1, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 2, 1, false>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 2, 1, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 2, 4, true>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 2, 4, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 2, 4, false>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 2, 4, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- }
- }
- } else {
- if (filtersPerThread == 1) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 1, 1, true>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 1, 1, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 1, 1, false>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 1, 1, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 1, 4, true>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 1, 4, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- } else {
- cudaFuncSetCacheConfig(
- kLocalPool<Pooler, 4, 32, 1, 4, false>, cudaFuncCachePreferL1);
- kLocalPool<Pooler, 4, 32, 1, 4, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- pooler);
- }
- }
- }
- }
- getLastCudaError("convLocalPool: kernel execution failed");
-}
-
-#endif /* CONV_UTIL_CUH */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef COMMON_CUH
-#define COMMON_CUH
-
-#include <helper_cuda.h> // helper functions CUDA error checking and initialization
-#include "../../nvmatrix/include/nvmatrix.cuh"
-#include "conv_util.cuh"
-
-#include "caffe2/core/context_gpu.h"
-
-enum FILTER_OUTPUT_ORDER { MODULE_FILTER_IMAGE, FILTER_MODULE_IMAGE };
-
-void convFilterActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups);
-void convFilterActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput);
-
-void localFilterActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups);
-void localFilterActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput);
-
-void convImgActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int imgSizeX,
- int numModulesY,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups);
-void convImgActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int imgSizeX,
- int numModulesY,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput);
-
-void localImgActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int imgSizeX,
- int numModulesY,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups);
-void localImgActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int imgSizeX,
- int numModulesY,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput);
-
-void convWeightActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int filterSize,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- int sumWidth);
-void convWeightActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int filterSize,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- int sumWidth,
- float scaleTargets,
- float scaleOutput);
-
-void localWeightActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int filterSize,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups);
-
-void localWeightActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int filterSize,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput);
-
-#endif /* COMMON_CUH */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include <cstring>
-#include <iostream>
-
-#include "../../nvmatrix/include/nvmatrix.cuh"
-#include "../../nvmatrix/include/nvmatrix_kernels.cuh"
-#include "../include/conv_util.cuh"
-
-using namespace std;
-
// Squares a scalar on the device. Used below (kNormalizeLCWeights) to
// accumulate sums of squares for L2 norms.
__device__ inline float square(const float a) {
  return a * a;
}
-
-/*
- * Horizontal reflection.
- * imgs: (numColors, imgSize, imgSize, numCases)
- * targets: (numColors, imgSize, imgSize, numCases)
- *
- * targets should be a different array from imgs.
- *
- * Block size: (4, 32)
- * blockIdx.y * 4 + threadIdx.y determines pixel
- * blockIdx.x * 32 * imgsPerThread + threadIdx.x determines case batch
- *
- */
// Horizontal reflection kernel (see layout comment above).
// Each thread copies one source pixel's value, for imgsPerThread cases and all
// numColors channels, into the horizontally mirrored target pixel.
template <int numColors, int imgsPerThread, bool checkCaseBounds>
__global__ void
kReflectH(float* imgs, float* targets, const int imgSize, const int numCases) {
  const int pxIdx = blockIdx.y * 4 + threadIdx.y;
  const int imgPixels = imgSize * imgSize;

  if (pxIdx < imgPixels) {
    const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
    const int pxIdxY = pxIdx / imgSize;
    const int pxIdxX = pxIdx % imgSize;

    const int pxIdxXR = imgSize - 1 - pxIdxX; // reflected coordinate
    const int pxIdxR = pxIdxY * imgSize + pxIdxXR;

    // Read from the source pixel, write to its mirror image in the same row.
    imgs += pxIdx * numCases + caseIdx;
    targets += pxIdxR * numCases + caseIdx;

#pragma unroll
    for (int i = 0; i < imgsPerThread; ++i) {
      // When numCases is a multiple of 32*imgsPerThread the bounds check is
      // compiled out (checkCaseBounds == false).
      if (!checkCaseBounds || caseIdx + i * 32 < numCases) {
#pragma unroll
        for (int c = 0; c < numColors; ++c) {
          // Channels are strided by imgPixels * numCases in both matrices.
          targets[c * imgPixels * numCases + i * 32] =
              imgs[c * imgPixels * numCases + i * 32];
        }
      }
    }
  }
}
-/*
- * Horizontal reflection.
- * imgs: (numColors, imgSize, imgSize, numCases)
- * targets: (numColors, imgSize, imgSize, numCases)
- */
// Host-side dispatcher for kReflectH.
// imgs/targets: (numColors, imgSize, imgSize, numCases); targets must be a
// different matrix from images (the kernel reads and writes different pixels).
// numColors, imgsPerThread and the bounds check are template parameters of the
// kernel, so every supported combination must be instantiated explicitly —
// hence the dispatch ladder below.
void convReflectHorizontal(NVMatrix& images, NVMatrix& targets, int imgSize) {
  int numCases = images.getNumCols();
  int imgPixels = imgSize * imgSize;
  int numColors = images.getNumRows() / imgPixels;
  assert(numColors * imgPixels == images.getNumRows());
  assert(numColors > 0 && numColors <= 3); // only 1..3 channels instantiated

  targets.resize(images);
  // More images per thread when the case count divides evenly; the bounds
  // check is only needed when 32*imgsPerThread does not divide numCases.
  int imgsPerThread = numCases % 128 == 0 ? 4 : numCases % 64 == 0 ? 2 : 1;
  bool checkCaseBounds = numCases % (32 * imgsPerThread) != 0;
  dim3 threads(32, 4);
  dim3 blocks(DIVUP(numCases, imgsPerThread * 32), DIVUP(imgPixels, 4));
  cudaStream_t stream = NVMatrix::getDefaultStream();
  if (checkCaseBounds) {
    if (numColors == 1) {
      if (imgsPerThread == 1) {
        cudaFuncSetCacheConfig(kReflectH<1, 1, true>, cudaFuncCachePreferL1);
        kReflectH<1, 1, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 2) {
        cudaFuncSetCacheConfig(kReflectH<1, 2, true>, cudaFuncCachePreferL1);
        kReflectH<1, 2, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 4) {
        cudaFuncSetCacheConfig(kReflectH<1, 4, true>, cudaFuncCachePreferL1);
        kReflectH<1, 4, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      }
    } else if (numColors == 2) {
      if (imgsPerThread == 1) {
        cudaFuncSetCacheConfig(kReflectH<2, 1, true>, cudaFuncCachePreferL1);
        kReflectH<2, 1, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 2) {
        cudaFuncSetCacheConfig(kReflectH<2, 2, true>, cudaFuncCachePreferL1);
        kReflectH<2, 2, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 4) {
        cudaFuncSetCacheConfig(kReflectH<2, 4, true>, cudaFuncCachePreferL1);
        kReflectH<2, 4, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      }
    } else if (numColors == 3) {
      if (imgsPerThread == 1) {
        cudaFuncSetCacheConfig(kReflectH<3, 1, true>, cudaFuncCachePreferL1);
        kReflectH<3, 1, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 2) {
        cudaFuncSetCacheConfig(kReflectH<3, 2, true>, cudaFuncCachePreferL1);
        kReflectH<3, 2, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 4) {
        cudaFuncSetCacheConfig(kReflectH<3, 4, true>, cudaFuncCachePreferL1);
        kReflectH<3, 4, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      }
    }
  } else {
    // Same ladder with checkCaseBounds == false (bounds check compiled out).
    if (numColors == 1) {
      if (imgsPerThread == 1) {
        cudaFuncSetCacheConfig(kReflectH<1, 1, false>, cudaFuncCachePreferL1);
        kReflectH<1, 1, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 2) {
        cudaFuncSetCacheConfig(kReflectH<1, 2, false>, cudaFuncCachePreferL1);
        kReflectH<1, 2, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 4) {
        cudaFuncSetCacheConfig(kReflectH<1, 4, false>, cudaFuncCachePreferL1);
        kReflectH<1, 4, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      }
    } else if (numColors == 2) {
      if (imgsPerThread == 1) {
        cudaFuncSetCacheConfig(kReflectH<2, 1, false>, cudaFuncCachePreferL1);
        kReflectH<2, 1, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 2) {
        cudaFuncSetCacheConfig(kReflectH<2, 2, false>, cudaFuncCachePreferL1);
        kReflectH<2, 2, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 4) {
        cudaFuncSetCacheConfig(kReflectH<2, 4, false>, cudaFuncCachePreferL1);
        kReflectH<2, 4, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      }
    } else if (numColors == 3) {
      if (imgsPerThread == 1) {
        cudaFuncSetCacheConfig(kReflectH<3, 1, false>, cudaFuncCachePreferL1);
        kReflectH<3, 1, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 2) {
        cudaFuncSetCacheConfig(kReflectH<3, 2, false>, cudaFuncCachePreferL1);
        kReflectH<3, 2, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      } else if (imgsPerThread == 4) {
        cudaFuncSetCacheConfig(kReflectH<3, 4, false>, cudaFuncCachePreferL1);
        kReflectH<3, 4, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(), targets.getDevData(), imgSize, numCases);
      }
    }
  }
  getLastCudaError("kReflectH: kernel execution failed");
}
-
-/*
- * blockIdx.y determines module in batches of B_Y
- * blockIdx.x determines filter in batches of B_X * filtersPerThread
- *
- * weights: (numModules, numColors, filterPixels, numFilters)
- * Not fully coalesced if B_X < 32, so use cache.
- */
// Renormalizes locally-connected filter weights in place: for each
// (module, filter) whose L2 norm exceeds `norm`, scales the filter so its norm
// equals `norm`; filters already within the bound are left unchanged
// (scale factor 1.0f). See layout comment above for the weights matrix shape.
template <int B_Y, int B_X, int filtersPerThread>
__global__ void kNormalizeLCWeights(
    float* weights,
    const uint numFilters,
    const int numModules,
    const uint weightsPerFilter,
    const float norm) {
  const uint moduleIdx = B_Y * blockIdx.y + threadIdx.y;
  const uint filterIdx = B_X * blockIdx.x + threadIdx.x;

  // Per-thread accumulators, one per handled filter (kept in registers).
  float prod[filtersPerThread];
#pragma unroll
  for (uint i = 0; i < filtersPerThread; ++i) {
    prod[i] = 0;
  }
  if (moduleIdx < numModules) {
    weights += moduleIdx * weightsPerFilter * numFilters + filterIdx;
    // Pass 1: accumulate the sum of squared weights for each filter.
    for (uint p = 0; p < weightsPerFilter; ++p) {
#pragma unroll
      for (uint i = 0; i < filtersPerThread; ++i) {
        prod[i] += square(weights[p * numFilters + i * B_X]);
      }
    }

    // Convert each sum into a rescale factor: norm/||w|| if the filter is too
    // large, otherwise 1 (no change). __fdividef is the fast device divide.
#pragma unroll
    for (uint i = 0; i < filtersPerThread; ++i) {
      prod[i] = sqrtf(prod[i]);
      prod[i] = prod[i] > norm ? __fdividef(norm, prod[i]) : 1.0f;
    }

    // Pass 2: apply the per-filter scale factors in place.
    for (uint p = 0; p < weightsPerFilter; ++p) {
#pragma unroll
      for (uint i = 0; i < filtersPerThread; ++i) {
        weights[p * numFilters + i * B_X] *= prod[i];
      }
    }
  }
}
-
-/*
- * weights: (numModules, numColors, filterPixels, numFilters)
- */
// Host-side dispatcher for kNormalizeLCWeights.
// weights: (numModules, numColors, filterPixels, numFilters), row-major,
// contiguous, non-transposed; modified in place so each filter's L2 norm is
// clamped to `norm`.
void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm) {
  int numFilters = weights.getNumCols();
  int weightsPerFilter = weights.getNumRows() / numModules;
  assert(numModules * weightsPerFilter == weights.getNumRows());

  assert(!weights.isTrans());
  assert(weights.isContiguous());
  assert(numFilters % 16 == 0);

  // Block shape: 32x4 when numFilters divides by 32, else 16x8.
  int bx = numFilters % 32 == 0 ? 32 : 16;
  int by = bx == 32 ? 4 : 8;

  int filtersPerThread =
      numFilters % 128 == 0 ? 4 : numFilters % 64 == 0 ? 2 : 1;
  dim3 blocks(numFilters / (bx * filtersPerThread), DIVUP(numModules, by));
  dim3 threads(bx, by);
  cudaStream_t stream = NVMatrix::getDefaultStream();
  // NOTE: the <4,32,*> instantiations below are consistent with threads(bx,by)
  // because numFilters % 64 == 0 implies numFilters % 32 == 0, i.e. bx=32,by=4.
  if (filtersPerThread == 4) {
    cudaFuncSetCacheConfig(
        kNormalizeLCWeights<4, 32, 4>, cudaFuncCachePreferL1);
    kNormalizeLCWeights<4, 32, 4><<<blocks, threads, 0, stream>>>(
        weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
  } else if (filtersPerThread == 2) {
    cudaFuncSetCacheConfig(
        kNormalizeLCWeights<4, 32, 2>, cudaFuncCachePreferL1);
    kNormalizeLCWeights<4, 32, 2><<<blocks, threads, 0, stream>>>(
        weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
  } else {
    if (numFilters % 32 == 0) {
      cudaFuncSetCacheConfig(
          kNormalizeLCWeights<4, 32, 1>, cudaFuncCachePreferL1);
      kNormalizeLCWeights<4, 32, 1><<<blocks, threads, 0, stream>>>(
          weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
    } else {
      // numFilters % 16 == 0 but not % 32: use the 16-wide block variant.
      cudaFuncSetCacheConfig(
          kNormalizeLCWeights<8, 16, 1>, cudaFuncCachePreferL1);
      kNormalizeLCWeights<8, 16, 1><<<blocks, threads, 0, stream>>>(
          weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
    }
  }
}
-
-/*
- * Block size 4x32
- * blockIdx.x determines img idx in batches of 32*imgsPerThread
- * blockIdx.y determines channel idx, pixel idx in batches of 4
- *
- * threadIdx.x determins case idx
- * threadIdx.y determines pixel idx
- *
- * imgs: (numChannels, imgPixels, numImages) with given imgStride
- * target: (numChannels, tgtPixels, numImages)
- */
// Crop kernel (see layout comment above): copies the tgtSize x tgtSize window
// whose top-left corner is (startY, startX) from each source image/channel
// into the target. Each thread copies one target pixel for imgsPerThread
// cases of one channel.
template <int imgsPerThread, bool checkCaseBounds>
__global__ void kCrop(
    float* imgs,
    float* target,
    const uint numImages,
    const int imgStride,
    const uint imgSize,
    const uint tgtSize,
    const uint startY,
    const uint startX) {
  const uint imgPixels = imgSize * imgSize;
  const uint tgtPixels = tgtSize * tgtSize;
  const uint caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
  // blockIdx.y jointly encodes the channel and a batch of 4 target pixels.
  const uint blockChanIdx = blockIdx.y / DIVUP(tgtPixels, 4);
  const uint tgtPixelIdx = 4 * (blockIdx.y % DIVUP(tgtPixels, 4)) + threadIdx.y;
  const uint tgtPxY = tgtPixelIdx / tgtSize;
  const uint tgtPxX = tgtPixelIdx % tgtSize;
  // Source pixel is the target pixel shifted by the crop offset.
  const uint srcPixelIdx = (startY + tgtPxY) * imgSize + startX + tgtPxX;

  if (tgtPixelIdx < tgtPixels) {
    // Note: source uses imgStride (may exceed numImages); target is dense.
    imgs += (blockChanIdx * imgPixels + srcPixelIdx) * imgStride + caseIdx;
    target += (blockChanIdx * tgtPixels + tgtPixelIdx) * numImages + caseIdx;

#pragma unroll
    for (uint i = 0; i < imgsPerThread; ++i) {
      if (!checkCaseBounds || (caseIdx + 32 * i < numImages)) {
        target[i * 32] = imgs[i * 32];
      }
    }
  }
}
-
-/*
- * Block size 4x32
- * blockIdx.y determines pixel idx in batches of 4
- * blockIdx.x determines case idx in batches of 32*imgsPerThread
- * threadIdx.y determines pixel idx
- * threadIdx.x determines case idx
- *
- * imgs: (3, imgPixels, numImages) with given imgStride
- * target: (3, imgPixels, numImages)
- *
- * Each thread produces (y,u,v) values for a particular (r,g,b) pixel
- *
- * The RGB --> YUV transform is (http://en.wikipedia.org/wiki/YUV):
- *
- * [Y] [ 0.2126 0.7152 0.0722 ][R]
- * [U] = [-0.09991 -0.33609 0.436 ][G]
- * [V] [ 0.615 -0.55861 -0.05639][B]
- */
// RGB -> YUV conversion kernel; the transform matrix is given in the comment
// above. Each thread converts one pixel for imgsPerThread cases.
template <int imgsPerThread, bool checkCaseBounds>
__global__ void kRGBToYUV(
    float* imgs,
    float* target,
    const int imgPixels,
    const int numImages,
    const int imgStride) {
  const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
  const int pxIdx = blockIdx.y * 4 + threadIdx.y;

  if (pxIdx < imgPixels) {
    // Per-channel strides; source may be padded (imgStride >= numImages).
    const int imgChannelStride = imgPixels * imgStride;
    const int tgtChannelStride = imgPixels * numImages;
    imgs += pxIdx * imgStride + caseIdx;
    target += pxIdx * numImages + caseIdx;

#pragma unroll
    for (int i = 0; i < imgsPerThread; ++i) {
      if (!checkCaseBounds || caseIdx + i * 32 < numImages) {
        const float R = imgs[0 * imgChannelStride + i * 32];
        const float G = imgs[1 * imgChannelStride + i * 32];
        const float B = imgs[2 * imgChannelStride + i * 32];
        target[0 * tgtChannelStride + i * 32] =
            0.2126f * R + 0.7152f * G + 0.0722f * B; // Y
        target[1 * tgtChannelStride + i * 32] =
            -0.09991f * R + -0.33609f * G + 0.436f * B; // U
        target[2 * tgtChannelStride + i * 32] =
            0.615f * R + -0.55861f * G + -0.05639f * B; // V
      }
    }
  }
}
-
// Nonlinearity used by the XYZ -> L*a*b* transform in kRGBToLAB below:
// cube root above the threshold, linear segment below it. The constants
// appear to be the standard CIE values ((6/29)^3 ~= 0.008856 and
// 7.787*x + 16/116) — see the Lab color space reference linked above.
__device__ inline float labf(const float x) {
  if (x > 0.0088564517f) {
    return __powf(x, 0.3333f); // fast device pow; approximates cbrt(x)
  }
  return 7.787037f * x + 0.13793103f;
}
-
-/*
- * Block size 4x32
- * blockIdx.y determines pixel idx in batches of 4
- * blockIdx.x determines case idx in batches of 32*imgsPerThread
- * threadIdx.y determines pixel idx
- * threadIdx.x determines case idx
- *
- * imgs: (3, imgPixels, numImages) with given imgStride
- * target: (3, imgPixels, numImages)
- *
- * This proceeds in two steps.
- *
- * - First, RGB values are linearly transformed to XYZ as per
- * http://en.wikipedia.org/wiki/CIE_XYZ_color_space
- * - Second, XYZ values are nonlinearly transformed to L*a*b* as per
- * http://en.wikipedia.org/wiki/Lab_color_space#The_forward_transformation
- *
- * Each thread produces (L*,a*,b*) values for a particular (r,g,b) pixel
- *
- * The RGB --> XYZ transform is:
- *
- * [X] [0.49 0.31 0.2 ][R]
- * [Y] = 5.6506753 * [0.17697 0.8124 0.01063 ][G]
- * [Z] [0 0.01 0.99 ][B]
- *
- * NOTE: The input should be in the range 0-1. Don't do mean-subtraction
- * beforehand.
- *
- * Then X_max, Y_max, Z_max = 5.6506753.
- *
- * The range of the L* values is [0, 100].
- * If the center flag is given, the range will be [-50, 50].
- *
- */
// RGB -> L*a*b* conversion kernel. First applies the linear RGB -> XYZ
// transform from the comment above, then the labf() nonlinearity, then the
// standard L*/a*/b* combinations. Input is expected in [0, 1] (see NOTE
// above). If `center` is set, 50 is subtracted from L* so its range is
// [-50, 50] instead of [0, 100].
template <int imgsPerThread, bool checkCaseBounds, bool center>
__global__ void kRGBToLAB(
    float* imgs,
    float* target,
    const int imgPixels,
    const int numImages,
    const int imgStride) {
  const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
  const int pxIdx = blockIdx.y * 4 + threadIdx.y;

  if (pxIdx < imgPixels) {
    // Per-channel strides; source may be padded (imgStride >= numImages).
    const int imgChannelStride = imgPixels * imgStride;
    const int tgtChannelStride = imgPixels * numImages;
    imgs += pxIdx * imgStride + caseIdx;
    target += pxIdx * numImages + caseIdx;

#pragma unroll
    for (int i = 0; i < imgsPerThread; ++i) {
      if (!checkCaseBounds || caseIdx + i * 32 < numImages) {
        const float R = imgs[0 * imgChannelStride + i * 32];
        const float G = imgs[1 * imgChannelStride + i * 32];
        const float B = imgs[2 * imgChannelStride + i * 32];

        // Linear RGB -> XYZ (the common 5.6506753 factor cancels in labf's
        // ratio-free use here; see the header comment for the full matrix).
        const float X = (0.49f * R + 0.31f * G + 0.2f * B);
        const float Y = (0.17697f * R + 0.8124f * G + 0.01063f * B);
        const float Z = (0.01f * G + 0.99f * B);

        const float labX = labf(X);
        const float labY = labf(Y);
        const float labZ = labf(Z);

        target[0 * tgtChannelStride + i * 32] =
            116.0f * labY - 16.0f - (center ? 50.0f : 0); // L*
        target[1 * tgtChannelStride + i * 32] = 500.0f * (labX - labY); // a*
        target[2 * tgtChannelStride + i * 32] = 200.0f * (labY - labZ); // b*
      }
    }
  }
}
-
-/*
- * Block size 16x32.
- * Each block produces a 4x4 chunk of the output image.
- * threadIdx.y determines pixel idx in 4x4 chunk.
- * threadIdx.x determines case idx.
- * blockIdx.x determines case idx in batches of 32*imgsPerThread.
- * blockIdx.y determines 4x4 chunk idx, channel idx.
- *
- * imgs: (numChannels, imgPixels, numImages) with given imgStride
- * target: (numChannels, tgtPixels, numImages)
- *
- * imgSize = scale * tgtSize (roughly)
- *
- * This is a rather naive kernel that relies on cache for speed. But all it's
- * doing is basic texture manipulation, which is very local in nature, so it
- * should be ok. Also, it will in practice be a tiny fraction of the runtime of
- * a large convnet.
- *
- * So that is my justification for being lazy here.
- */
// Bilinear resize kernel (see block-layout comment above). Each block fills a
// 4x4 chunk of one channel of the target; each thread interpolates one target
// pixel from its four source neighbors, for imgsPerThread cases.
template <int imgsPerThread, bool checkCaseBounds>
__global__ void kResizeBilinear(
    float* imgs,
    float* target,
    const int imgSize,
    const int tgtSize,
    const int numImages,
    const int imgStride,
    const float scale,
    const float centerScale) {
  const int numChunksX = DIVUP(tgtSize, 4);
  const int numChunks = numChunksX * numChunksX;
  // blockIdx.y jointly encodes the channel and the 4x4 chunk within it.
  const int channelIdx = blockIdx.y / numChunks;
  const int chunkIdx = blockIdx.y % numChunks;
  const int chunkIdxX = chunkIdx % numChunksX;
  const int chunkIdxY = chunkIdx / numChunksX;
  const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
  const int imgPixels = imgSize * imgSize;
  const int tgtPixels = tgtSize * tgtSize;

  // threadIdx.y in [0,16) selects the pixel within the 4x4 chunk.
  const int pxX = 4 * chunkIdxX + threadIdx.y % 4;
  const int pxY = 4 * chunkIdxY + threadIdx.y / 4;

  if (pxY < tgtSize && pxX < tgtSize) {
    const int pxIdx = pxY * tgtSize + pxX;

    imgs += channelIdx * imgPixels * imgStride + caseIdx;
    target += channelIdx * tgtPixels * numImages + pxIdx * numImages + caseIdx;

    // Map the target pixel back into source coordinates, clamped so the
    // bottom-right neighbor (srcPx3 below) stays in bounds.
    // This will cause slight distortions at the edges when upsampling in some
    // cases. But I think that's not a big deal.
    const float srcPxX = fmaxf(
        0.0f,
        fminf(
            __int2float_rn(imgSize) - 1.01f,
            __int2float_rn(pxX) * scale + centerScale));
    const float srcPxY = fmaxf(
        0.0f,
        fminf(
            __int2float_rn(imgSize) - 1.01f,
            __int2float_rn(pxY) * scale + centerScale));

    // u = 1 - frac(srcPxX): weight of the LEFT column in the horizontal lerp.
    // w = frac(srcPxY): weight of the BOTTOM row in the vertical lerp.
    const float u = floorf(srcPxX + 1) - srcPxX;
    const float w = srcPxY - floorf(srcPxY);

    // Consider doing max(0, min(imgSize, x)) here
    const int srcPx0 =
        (__float2int_rd(srcPxY) * imgSize + __float2int_rd(srcPxX)); // top-left
    const int srcPx1 = srcPx0 + 1; // top-right
    const int srcPx2 = srcPx0 + imgSize; // bottom-left
    const int srcPx3 = srcPx2 + 1; // bottom-right

#pragma unroll
    for (int c = 0; c < imgsPerThread; ++c) {
      if (!checkCaseBounds || caseIdx + c * 32 < numImages) {
        const float val0 = imgs[srcPx0 * imgStride + c * 32];
        const float val1 = imgs[srcPx1 * imgStride + c * 32];
        const float val2 = imgs[srcPx2 * imgStride + c * 32];
        const float val3 = imgs[srcPx3 * imgStride + c * 32];

        // Horizontal lerps: c0 = u*val0 + (1-u)*val1 (top row),
        // c1 likewise for the bottom row; then a vertical lerp by w.
        const float c0 = u * (val0 - val1) + val1;
        const float c1 = u * (val2 - val3) + val3;

        target[32 * c] = w * (c1 - c0) + c0;
      }
    }
  }
}
-
-/*
- * Block size B_YxB_X.
- * B_X*imgsPerThread*blockIdx.x + threadIdx.x determines img idx
- * B_Y*blockIdx.y + threadIdx.y determines img row (col if !horiz), channel idx
- *
- * imgs: (numChannels, imgPixels, numImages) with given imgStride
- * filter: (1, 2*radius + 1)
- * target: (numChannels, imgPixels, numImages)
- *
- * target can be the same matrix as imgs.
- * radius must be one of 3, 5, 7, 9.
- *
- * Tried imgsPerThread, slower.
- */
/*
 * Separable 1-D Gaussian blur pass (horizontal if `horiz`, else vertical).
 * Each thread streams one full row (or column) of one channel for one image
 * through a small ring buffer of partial sums, so every input pixel is read
 * from global memory exactly once.
 */
template <int B_Y, int B_X, int radius>
__global__ void kGaussianBlur(
    float* imgs,
    float* filter,
    float* target,
    const int imgSize,
    const int numImages,
    const int imgStride,
    const int numChannels,
    const bool horiz,
    const float scaleTargets,
    const float scaleOutputs) {
  const int filterWidth = 2 * radius + 1;
  // Only the first 2*radius taps are staged; the streaming loops below index
  // shFilter[0 .. filterWidth - 2] and never read filter[2*radius].
  // NOTE(review): presumably this relies on the filter being symmetric --
  // confirm against the host-side filter construction.
  __shared__ float shFilter[filterWidth - 1];

  const int imgPixels = imgSize * imgSize;
  // ty enumerates (channel, row) pairs: grid.y covers numChannels * imgSize.
  const int ty = B_Y * blockIdx.y + threadIdx.y;
  const int channelIdx = ty / imgSize;
  const int rowIdx = ty % imgSize;
  const int imgIdx = B_X * blockIdx.x + threadIdx.x;

  // const int tidx = B_Y * threadIdx.y + threadIdx.x;
  // Position the pointers at the first pixel of this thread's row (horiz) or
  // column (!horiz).  imgs advances by imgStride, target is dense
  // (numImages-wide rows).
  if (horiz) {
    imgs += channelIdx * imgPixels * imgStride + rowIdx * imgSize * imgStride +
        imgIdx;
    target += channelIdx * imgPixels * numImages +
        rowIdx * imgSize * numImages + imgIdx;
  } else {
    imgs += channelIdx * imgPixels * imgStride + rowIdx * imgStride + imgIdx;
    target += channelIdx * imgPixels * numImages + rowIdx * numImages + imgIdx;
  }
  // Ring buffer of partial convolution sums for the next 2*radius outputs.
  float outputs[filterWidth - 1];
#pragma unroll
  for (int r = 0; r < filterWidth - 1; r++) {
    outputs[r] = 0;
  }
  // Every B_Y thread row writes the same values here; the redundant stores
  // are benign.
  if (threadIdx.x < filterWidth - 1) {
    shFilter[threadIdx.x] = filter[threadIdx.x];
  }
  __syncthreads();

  if (imgIdx < numImages && channelIdx < numChannels) {
// This writes radius*2 = filterWidth - 1 values to outputs
// (left-border ramp-up: the first `radius` input pixels seed the partial
// sums before any output is complete).
#pragma unroll
    for (int col = 0; col < radius; col++) {
      float px = imgs[0];
#pragma unroll
      for (int r = 0; r < radius + 1 + col; r++) {
        outputs[r] += px * shFilter[radius + col - r];
      }
      imgs += horiz ? imgStride : imgStride * imgSize;
    }

    // Unfortunately this has to be at this level of granularity
    if (scaleTargets != 0) {
      for (int col = radius; col < imgSize; col++) { // loop over img columns
        float px = imgs[0];
        // outputs[0] is complete once the current pixel is folded in.
        target[0] = scaleTargets * target[0] +
            scaleOutputs * (outputs[0] + px * shFilter[0]);

// Shift the ring buffer down one slot, folding the new pixel into each.
#pragma unroll
        for (int r = 1; r < radius * 2; r++) {
          outputs[r - 1] = outputs[r] + px * shFilter[r];
        }
        outputs[filterWidth - 2] = px * shFilter[0];

        imgs += horiz ? imgStride : imgStride * imgSize;
        target += horiz ? numImages : numImages * imgSize;
      }

// Right-border ramp-down: flush the last `radius` outputs.
#pragma unroll
      for (int r = 0; r < radius; r++) {
        float* t = &target[0];
        t[0] = scaleTargets * t[0] + scaleOutputs * outputs[r];
        target += horiz ? numImages : numImages * imgSize;
      }
    } else {
      // Same streaming loop, minus the read-modify-write of target.
      for (int col = radius; col < imgSize; col++) { // loop over img columns
        float px = imgs[0];
        target[0] = scaleOutputs * (outputs[0] + px * shFilter[0]);
#pragma unroll
        for (int r = 1; r < radius * 2; r++) {
          outputs[r - 1] = outputs[r] + px * shFilter[r];
        }
        outputs[filterWidth - 2] = px * shFilter[0];

        imgs += horiz ? imgStride : imgStride * imgSize;
        target += horiz ? numImages : numImages * imgSize;
      }

#pragma unroll
      for (int r = 0; r < radius; r++) {
        target[0] = scaleOutputs * outputs[r];
        target += horiz ? numImages : numImages * imgSize;
      }
    }
  }
}
-
-/*
- * Block size B_YxB_X
- * blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread
- * blockIdx.y determines output.y, filter idx in batches of B_Y*filtersPerThread
- *
- * So each block does one output for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines filter idx
- *
- * imgs: (numChannels, imgPixels, numImages)
- * target: (numChannels, numOutputs, numImages)
- *
- * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
 * numChannels must be divisible by chansPerThread
- */
-
/*
 * Bed-of-nails subsampling kernel.  Forward (!reverse): each output pixel is
 * a single sampled image pixel.  Reverse: the output gradient is scattered
 * back into the sampled image positions.
 *
 * Block (B_Y x B_X); blockIdx.x packs (output x, image block), blockIdx.y
 * packs (output y, channel block).  threadIdx.x selects the image,
 * threadIdx.y selects a run of chansPerThread channels.
 */
template <
    int B_Y,
    int B_X,
    int imgsPerThread,
    int chansPerThread,
    bool checkCaseBounds>
__global__ void kBedOfNails(
    float* imgs,
    float* target,
    const int imgSize,
    const int numChannels,
    const int numImages,
    const int startX,
    const int strideX,
    const int outputsX,
    const bool reverse,
    const float scaleTargets,
    const float scaleOutput) {
  // Unpack the two packed grid dimensions.
  const int imgBlocks = DIVUP(numImages, B_X * imgsPerThread);
  const int chanBlocks = DIVUP(numChannels, B_Y * chansPerThread);
  const int outX = blockIdx.x / imgBlocks;
  const int outY = blockIdx.y / chanBlocks;
  const int imgBase = (blockIdx.x % imgBlocks) * B_X * imgsPerThread;
  const int chanBase = (blockIdx.y % chanBlocks) * B_Y * chansPerThread;
  const int chanIdx = chanBase + threadIdx.y * chansPerThread;
  if (chanIdx >= numChannels) {
    return;
  }
  const int outIdx = outY * outputsX + outX;
  const int numOutputs = outputsX * outputsX;
  const int imgPixels = imgSize * imgSize;

  // Image pixel sampled by this output position.
  const int srcPxX = startX + outX * strideX;
  const int srcPxY = startX + outY * strideX;
  const int imgIdx = imgBase + threadIdx.x;
  const int srcPx = srcPxY * imgSize + srcPxX;

  imgs += chanIdx * imgPixels * numImages + srcPx * numImages + imgIdx;
  target += (chanIdx * numOutputs + outIdx) * numImages + imgIdx;

#pragma unroll
  for (int i = 0; i < imgsPerThread; i++) {
    if (checkCaseBounds && imgIdx + i * B_X >= numImages) {
      continue;
    }
#pragma unroll
    for (int c = 0; c < chansPerThread; c++) {
      const int tOff = c * numOutputs * numImages + i * B_X;
      const int iOff = c * imgPixels * numImages + i * B_X;
      if (reverse) {
        // Scatter: accumulate (or overwrite) the gradient into the image.
        const float g = scaleOutput * target[tOff];
        imgs[iOff] = scaleTargets != 0 ? scaleTargets * imgs[iOff] + g : g;
      } else {
        // Gather: accumulate (or overwrite) the sampled pixel into target.
        const float v = scaleOutput * imgs[iOff];
        target[tOff] = scaleTargets != 0 ? scaleTargets * target[tOff] + v : v;
      }
    }
  }
}
-
-/*
- * imgs: (numChannels, imgPixels, numImages)
- * target: (numChannels, outputs, numImages)
- */
/*
 * Host-side launcher for kBedOfNails.
 *
 * imgs:   (numChannels, imgPixels, numImages)
 * target: (numChannels, outputs, numImages)
 *
 * In forward mode (!reverse) the kernel samples `images` into `target`; in
 * reverse mode it scatters `target` (the output gradient) into `images`.
 * When scaleTargets == 0 the destination matrix is (re)sized here; otherwise
 * the results are accumulated into the existing destination.
 */
void _convBedOfNails(
    NVMatrix& images,
    NVMatrix& target,
    int numChannels,
    int imgSize,
    int startX,
    int strideX,
    bool reverse,
    float scaleTargets,
    float scaleOutput) {
  // In reverse mode `target` holds the (already-sized) gradient input.
  int numImages = reverse ? target.getNumCols() : images.getNumCols();
  int imgPixels = imgSize * imgSize;

  assert(!images.isTrans());
  assert(!target.isTrans());
  assert(images.isContiguous());
  assert(target.isContiguous());
  assert(strideX > 1);

  int outputsX = DIVUP(imgSize, strideX);
  int outputs = outputsX * outputsX;
  if (reverse) {
    assert(target.getNumRows() == numChannels * outputs);
  } else {
    assert(images.getNumRows() == numChannels * imgPixels);
  }

  if (scaleTargets == 0) {
    if (reverse) {
      images.resize(numChannels * imgPixels, numImages);
      images.apply(NVMatrixOps::Zero());
    } else {
      target.resize(numChannels * outputs, numImages);
    }
  } else {
    if (reverse) {
      // BUGFIX: the kernel writes `images` with imgPixels strides, and the
      // scaleTargets == 0 branch above sizes it as numChannels * imgPixels.
      // The old assert compared against numChannels * outputs, which cannot
      // hold for any valid call when strideX > 1.
      assert(images.getNumRows() == numChannels * imgPixels);
      assert(images.getNumCols() == numImages);
    } else {
      assert(target.getNumRows() == numChannels * outputs);
      assert(target.getNumCols() == numImages);
    }
  }

  // Pick the widest per-thread image batch the case count allows; fall back
  // to bounds checking when numImages isn't a multiple of the batch.
  int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
  bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0;
  int chansPerThread = numChannels % 8 == 0 ? 2 : 1;
  dim3 threads(32, 4);
  dim3 blocks(
      DIVUP(numImages, 32 * imgsPerThread) * outputsX,
      DIVUP(numChannels, 4 * chansPerThread) * outputsX);
  cudaStream_t stream = NVMatrix::getDefaultStream();
  // Template dispatch: (imgsPerThread, chansPerThread, checkCaseBounds) are
  // compile-time parameters, so every combination needs its own launch.
  if (imgsPerThread == 4) {
    if (chansPerThread == 1) {
      if (checkCaseBounds) {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 4, 1, true>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 4, 1, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      } else {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 4, 1, false>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 4, 1, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      }
    } else {
      if (checkCaseBounds) {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 4, 2, true>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 4, 2, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      } else {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 4, 2, false>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 4, 2, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      }
    }
  } else if (imgsPerThread == 2) {
    if (chansPerThread == 1) {
      if (checkCaseBounds) {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 2, 1, true>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 2, 1, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      } else {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 2, 1, false>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 2, 1, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      }
    } else {
      if (checkCaseBounds) {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 2, 2, true>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 2, 2, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      } else {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 2, 2, false>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 2, 2, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      }
    }
  } else {
    if (chansPerThread == 1) {
      if (checkCaseBounds) {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 1, 1, true>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 1, 1, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      } else {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 1, 1, false>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 1, 1, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      }
    } else {
      if (checkCaseBounds) {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 1, 2, true>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 1, 2, true><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      } else {
        cudaFuncSetCacheConfig(
            kBedOfNails<4, 32, 1, 2, false>, cudaFuncCachePreferL1);
        kBedOfNails<4, 32, 1, 2, false><<<blocks, threads, 0, stream>>>(
            images.getDevData(),
            target.getDevData(),
            imgSize,
            numChannels,
            numImages,
            startX,
            strideX,
            outputsX,
            reverse,
            scaleTargets,
            scaleOutput);
      }
    }
  }
}
-
/*
 * Forward bed-of-nails subsampling: samples `images` on a regular grid
 * (offset startX, step strideX) into `target`.
 *
 * imgs:   (numChannels, imgPixels, numImages)
 * target: (numChannels, outputs, numImages)
 */
void convBedOfNails(
    NVMatrix& images,
    NVMatrix& target,
    int numChannels,
    int imgSize,
    int startX,
    int strideX,
    float scaleTargets,
    float scaleOutput) {
  const bool reverse = false; // forward pass: gather pixels into target
  _convBedOfNails(
      images, target, numChannels, imgSize, startX, strideX, reverse,
      scaleTargets, scaleOutput);
}
-
/*
 * Backward pass of bed-of-nails subsampling: scatters `actsGrad` back into
 * the sampled positions of `target` (the image-shaped gradient).
 */
void convBedOfNailsUndo(
    NVMatrix& actsGrad,
    NVMatrix& target,
    int numChannels,
    int imgSize,
    int startX,
    int strideX,
    float scaleTargets,
    float scaleOutput) {
  const bool reverse = true;
  // Argument order is intentional: in reverse mode _convBedOfNails writes
  // into its first ("images") argument, so `target` goes first here.
  _convBedOfNails(
      target, actsGrad, numChannels, imgSize, startX, strideX, reverse,
      scaleTargets, scaleOutput);
}
-
-/*
- * imgs: (numChannels, imgPixels, numImages) with given imgStride
- * filter: (1, 2*radius + 1)
- * target: (numChannels, imgPixels, numImages)
- */
/*
 * Host-side launcher for kGaussianBlur: one separable 1-D blur pass
 * (horizontal if `horiz`, else vertical) over every channel of every image.
 *
 * imgs: (numChannels, imgPixels, numImages) with given imgStride
 * filter: (1, 2*radius + 1)
 * target: (numChannels, imgPixels, numImages)
 */
void convGaussianBlur(
    NVMatrix& images,
    NVMatrix& filter,
    NVMatrix& target,
    bool horiz,
    int numChannels,
    float scaleTargets,
    float scaleOutputs) {
  int numImages = images.getNumCols();
  // The filter has 2*radius + 1 taps, so integer division recovers radius.
  int radius = filter.getNumCols() / 2;
  int imgPixels = images.getNumRows() / numChannels;
  int imgSize = int(sqrt(imgPixels)); // images are assumed square

  assert(imgPixels == imgSize * imgSize);
  assert(radius >= 1 && radius <= 4); // only radii 1..4 are instantiated
  assert(imgSize >= 2 * radius + 1);
  assert(filter.getNumRows() == 1);
  assert(images.getNumRows() == numChannels * imgPixels);
  assert(!images.isTrans());
  assert(!filter.isTrans());
  assert(!target.isTrans());
  assert(target.isContiguous());
  if (scaleTargets == 0) {
    target.resize(images);
  } else {
    assert(target.isSameDims(images));
  }

  // One thread per (image, row-or-column); grid.y covers all
  // numChannels * imgSize rows of threads.
  dim3 threads(32, 4);
  dim3 blocks(
      DIVUP(numImages, threads.x), DIVUP(numChannels * imgSize, threads.y));
  cudaStream_t stream = NVMatrix::getDefaultStream();
  // radius is a template parameter (enables full loop unrolling in the
  // kernel), so each supported value gets its own instantiation.
  if (radius == 1) {
    cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 1>, cudaFuncCachePreferL1);
    kGaussianBlur<4, 32, 1><<<blocks, threads, 0, stream>>>(
        images.getDevData(),
        filter.getDevData(),
        target.getDevData(),
        imgSize,
        numImages,
        images.getStride(),
        numChannels,
        horiz,
        scaleTargets,
        scaleOutputs);

  } else if (radius == 2) {
    cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 2>, cudaFuncCachePreferL1);
    kGaussianBlur<4, 32, 2><<<blocks, threads, 0, stream>>>(
        images.getDevData(),
        filter.getDevData(),
        target.getDevData(),
        imgSize,
        numImages,
        images.getStride(),
        numChannels,
        horiz,
        scaleTargets,
        scaleOutputs);

  } else if (radius == 3) {
    cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 3>, cudaFuncCachePreferL1);
    kGaussianBlur<4, 32, 3><<<blocks, threads, 0, stream>>>(
        images.getDevData(),
        filter.getDevData(),
        target.getDevData(),
        imgSize,
        numImages,
        images.getStride(),
        numChannels,
        horiz,
        scaleTargets,
        scaleOutputs);
  } else if (radius == 4) {
    cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 4>, cudaFuncCachePreferL1);
    kGaussianBlur<4, 32, 4><<<blocks, threads, 0, stream>>>(
        images.getDevData(),
        filter.getDevData(),
        target.getDevData(),
        imgSize,
        numImages,
        images.getStride(),
        numChannels,
        horiz,
        scaleTargets,
        scaleOutputs);
  }
}
-
-/*
- * Block size 1x128
- * blockIdx.x determines pixel.x, image idx in batches of 128*imgsPerThread
- * blockIdx.y determines pixel.y
- *
 * So each block does one output for some number of images and all the filters.
- *
- * threadIdx.x determines img idx
- *
- * imgs: (numFilters, imgPixels, numImages)
- * meanDiffs: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages) (out)
- * target: (numFilters, imgPixels, numImages) (out)
- *
- * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
- * numFilters must be divisible by B_Y*filtersPerThread
- */
-
/*
 * Response normalization for a small, compile-time number of filters.
 * Each thread handles one output pixel for imgsPerThread images across all
 * numFilters filters: it sums square(meanDiffs) over a sizeX x sizeX spatial
 * window (clipped at the image border), then writes
 *   denoms = minDiv + addScale * sum
 *   target = imgs * denoms^(-powScale)
 */
template <int imgsPerThread, int numFilters, bool checkCaseBounds>
__global__ void kCNorm_fewfilter(
    float* imgs,
    float* meanDiffs,
    float* denoms,
    float* target,
    const int imgSize,
    const int numImages,
    const int sizeX,
    const float addScale,
    const float powScale,
    const float minDiv) {
  const int imgPixels = imgSize * imgSize;
  const int numImgBlocks = DIVUP(numImages, 128 * imgsPerThread);
  // blockIdx.x packs (pixel x, image block); blockIdx.y is pixel y directly.
  const int pxIdxX = blockIdx.x / numImgBlocks;
  const int pxIdxY = blockIdx.y;
  const int blockImgIdx = (blockIdx.x % numImgBlocks) * 128 * imgsPerThread;

  const int pxIdx = pxIdxY * imgSize + pxIdxX;

  // Top-left corner of the (possibly off-image) normalization window.
  const int startPxX = -sizeX / 2 + pxIdxX;
  const int startPxY = -sizeX / 2 + pxIdxY;
  const int imgIdx = blockImgIdx + threadIdx.x;

  imgs += pxIdx * numImages + imgIdx;
  denoms += pxIdx * numImages + imgIdx;
  // meanDiffs gets no pixel offset: the window loop supplies it per pixel.
  meanDiffs += imgIdx;
  target += pxIdx * numImages + imgIdx;

  // Per-thread accumulators, one per (filter, image) pair.
  float prod[numFilters][imgsPerThread];
#pragma unroll
  for (int i = 0; i < imgsPerThread; i++) {
    if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
#pragma unroll
      for (int f = 0; f < numFilters; f++) {
        prod[f][i] = 0;
      }
    }
  }
  // Clip the window to the image bounds.
  const int loopStartY = MAX(0, startPxY);
  const int loopStartX = MAX(0, startPxX);
  const int loopEndY = MIN(imgSize, startPxY + sizeX);
  const int loopEndX = MIN(imgSize, startPxX + sizeX);

  for (int y = loopStartY; y < loopEndY; y++) {
    for (int x = loopStartX; x < loopEndX; x++) {
      const int imgPx = y * imgSize + x;
#pragma unroll
      for (int i = 0; i < imgsPerThread; i++) {
        if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
#pragma unroll
          for (int f = 0; f < numFilters; f++) {
            prod[f][i] += square(
                meanDiffs[(f * imgPixels + imgPx) * numImages + i * 128]);
          }
        }
      }
    }
  }

// Finalize: write the denominators and the normalized activations.
#pragma unroll
  for (int i = 0; i < imgsPerThread; i++) {
    if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
#pragma unroll
      for (int f = 0; f < numFilters; f++) {
        prod[f][i] = minDiv + addScale * prod[f][i];
        denoms[f * imgPixels * numImages + i * 128] = prod[f][i];
        target[f * imgPixels * numImages + i * 128] =
            imgs[f * imgPixels * numImages + i * 128] *
            __powf(prod[f][i], -powScale);
      }
    }
  }
}
-
-/*
- * Block size B_YxB_X
- * blockIdx.x determines image idx in batches of B_X*imgsPerThread
- * blockIdx.y determines filter idx in batches of B_Y*filtersPerThread
- * blockIdx.z determines pixel
- *
- * So each block does one pixel for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines filter idx
- *
- * imgs: (numFilters, imgPixels, numImages)
- * means: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages) (out)
- * target: (numFilters, imgPixels, numImages) (out)
- *
- * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
- * numFilters must be divisible by B_Y*filtersPerThread
- */
/*
 * Response normalization for many filters.  Same math as kCNorm_fewfilter
 * (sum of square(meanDiffs) over a clipped sizeX x sizeX spatial window,
 * then denoms = minDiv + addScale*sum and target = imgs * denoms^(-powScale)),
 * but the filter dimension is tiled across blockIdx.y / threadIdx.y and the
 * pixel is taken from blockIdx.z.
 */
template <
    int B_Y,
    int B_X,
    int imgsPerThread,
    int filtersPerThread,
    bool checkCaseBounds>
__global__ void kCNorm_manyfilter(
    float* imgs,
    float* meanDiffs,
    float* denoms,
    float* target,
    const int imgSize,
    const int numFilters,
    const int numImages,
    const int sizeX,
    const float addScale,
    const float powScale,
    const float minDiv) {
  const int imgPixels = imgSize * imgSize;

  // blockIdx.z enumerates pixels; x/y enumerate image and filter batches.
  const int pxIdxX = blockIdx.z % imgSize;
  const int pxIdxY = blockIdx.z / imgSize;
  const int blockImgIdx = blockIdx.x * B_X * imgsPerThread;
  const int blockFilterIdx = blockIdx.y * B_Y * filtersPerThread;

  const int pxIdx = pxIdxY * imgSize + pxIdxX;

  // Top-left corner of the (possibly off-image) normalization window.
  const int startPxX = -sizeX / 2 + pxIdxX;
  const int startPxY = -sizeX / 2 + pxIdxY;
  const int imgIdx = blockImgIdx + threadIdx.x;
  imgs +=
      ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
  // meanDiffs gets no pixel offset: the window loop supplies it per pixel.
  meanDiffs += (blockFilterIdx + threadIdx.y) * imgPixels * numImages + imgIdx;
  denoms +=
      ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
  target +=
      ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;

  // Per-thread accumulators, one per (filter, image) pair.
  float prod[filtersPerThread][imgsPerThread];
#pragma unroll
  for (int i = 0; i < imgsPerThread; i++) {
    if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
#pragma unroll
      for (int f = 0; f < filtersPerThread; f++) {
        prod[f][i] = 0;
      }
    }
  }

  // Clip the window to the image bounds.
  const int loopStartY = max(0, startPxY);
  const int loopStartX = max(0, startPxX);
  const int loopEndY = min(imgSize, startPxY + sizeX);
  const int loopEndX = min(imgSize, startPxX + sizeX);

  for (int y = loopStartY; y < loopEndY; y++) {
    for (int x = loopStartX; x < loopEndX; x++) {
      const int imgPx = y * imgSize + x;
#pragma unroll
      for (int f = 0; f < filtersPerThread; f++) {
#pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
          if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
            // Successive f's are B_Y filters apart (filters are tiled across
            // threadIdx.y within the block).
            prod[f][i] += square(
                meanDiffs[(f * B_Y * imgPixels + imgPx) * numImages + i * B_X]);
          }
        }
      }
    }
  }
// Finalize: write the denominators and the normalized activations.
#pragma unroll
  for (int f = 0; f < filtersPerThread; f++) {
#pragma unroll
    for (int i = 0; i < imgsPerThread; i++) {
      if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
        prod[f][i] = minDiv + addScale * prod[f][i];
        denoms[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
        target[f * B_Y * imgPixels * numImages + i * B_X] =
            imgs[f * B_Y * imgPixels * numImages + i * B_X] *
            __powf(prod[f][i], -powScale);
      }
    }
  }
}
-
-/*
- * Block size 16xB_X
 * blockIdx.x determines 4x4 pixel.x region, image idx in batches of
 *            B_X*imgsPerThread
 * blockIdx.y determines 4x4 pixel.y region, filter idx in batches of
 *            filtersPerThread
- *
- * So each block does 4x4 region of pixels for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines pixel idx
- *
- * imgs: (numFilters, imgPixels, numImages)
- * means: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages) (out)
- * target: (numFilters, imgPixels, numImages) (out)
- *
- * B_X one of 8, 16, 32
- * imgsPerThread one of 1, 2, 4, 8, 16
- *
- * B_XximgsPerThread MUST be divisible by 32.
- * Number of filters MUST be divisible by filtersPerThread.
- *
- * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
- * numFilters must be divisible by filtersPerThread
- *
- * Final write-out will not be fully coalesced unless B_X is 32. But there's a
- * lot more reading than writing here, and the reading is all coalesced, so it
- * should be OK.
- */
/*
 * Tiled response-normalization kernel: each block computes a 4x4 region of
 * output pixels, staging meanDiffs tiles in shared memory so each loaded
 * pixel is reused by all 16 thread rows of the block.  Same math as
 * kCNorm_fewfilter/_manyfilter.
 */
template <
    int B_X,
    int imgsPerThread,
    int filtersPerThread,
    bool checkCaseBounds>
__global__ void kCNorm2(
    float* imgs,
    float* meanDiffs,
    float* denoms,
    float* target,
    const int imgSize,
    const int numFilters,
    const int numImages,
    const int sizeX,
    const float addScale,
    const float powScale,
    const float minDiv) {
  // One tile of meanDiffs for the current source pixel, all filters/images.
  __shared__ float shDiffs[filtersPerThread][B_X * imgsPerThread];
  const int imgPixels = imgSize * imgSize;
  const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
  const int numFilterBlocks = numFilters / (filtersPerThread);
  // blockIdx.x packs (4x4 region x, image block); .y packs (region y,
  // filter block).
  const int blockPxX = 4 * (blockIdx.x / numImgBlocks);
  const int blockPxY = 4 * (blockIdx.y / numFilterBlocks);
  const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
  const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread;

  // Re-linearize the block's threads into 32-wide rows for coalesced loads.
  const int tidx = threadIdx.y * B_X + threadIdx.x;
  const int loadY = tidx / 32, loadX = tidx % 32;

  // Union of the normalization windows of all 16 pixels in this region.
  const int startPxX = MAX(0, -sizeX / 2 + blockPxX);
  const int startPxY = MAX(0, -sizeX / 2 + blockPxY);
  const int endPxX = MIN(imgSize, blockPxX + DIVUP(sizeX, 2) + 3);
  const int endPxY = MIN(imgSize, blockPxY + DIVUP(sizeX, 2) + 3);

  // This thread's own output pixel within the 4x4 region (threadIdx.y is the
  // pixel index).
  const int myPxX = blockPxX + threadIdx.y % 4;
  const int myPxY = blockPxY + threadIdx.y / 4;
  const int myPxIdx = myPxY * imgSize + myPxX;
  // const bool doWork = myPxX < imgSize && myPxY < imgSize;
  // This thread's own (unclipped) window bounds.
  const int myStartPxY = -sizeX / 2 + myPxY;
  const int myStartPxX = -sizeX / 2 + myPxX;
  const int myEndPxY = myPxY + DIVUP(sizeX, 2);
  const int myEndPxX = myPxX + DIVUP(sizeX, 2);

  const int imgIdx = blockImgIdx + threadIdx.x;

  imgs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
  // meanDiffs is addressed with the load indices (loadY/loadX), not the
  // compute indices.
  meanDiffs +=
      (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
  denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
  target += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;

  float prod[filtersPerThread][imgsPerThread];
#pragma unroll
  for (int i = 0; i < imgsPerThread; i++) {
    if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
#pragma unroll
      for (int f = 0; f < filtersPerThread; f++) {
        prod[f][i] = 0;
      }
    }
  }

  for (int y = startPxY; y < endPxY; y++) {
    const bool isInY = y >= myStartPxY && y < myEndPxY;
    for (int x = startPxX; x < endPxX; x++) {
      const int px = y * imgSize + x;
// All the threads load a pixel from memory
#pragma unroll
      for (int ly = 0; ly < filtersPerThread; ly += B_X / 2) {
        if (filtersPerThread % (B_X / 2) == 0 ||
            ly + loadY < filtersPerThread) {
#pragma unroll
          for (int lx = 0; lx < B_X * imgsPerThread; lx += 32) {
            if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) {
              shDiffs[ly + loadY][lx + loadX] =
                  meanDiffs[(ly * imgPixels + px) * numImages + lx];
            }
          }
        }
      }
      __syncthreads(); // tile fully loaded before anyone reads it

      // Each row of threads decides if it's interested in this pixel
      if (isInY && x >= myStartPxX && x < myEndPxX) {
#pragma unroll
        for (int i = 0; i < imgsPerThread; i++) {
          if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
#pragma unroll
            for (int f = 0; f < filtersPerThread; f++) {
              prod[f][i] += square(shDiffs[f][threadIdx.x + i * B_X]);
            }
          }
        }
      }
      __syncthreads(); // all reads done before the tile is overwritten
    }
  }
  // imgs -= (loadY * imgPixels - myPxIdx) * numImages + loadX;
  // imgs += threadIdx.x;
  // Threads whose pixel falls outside the image only helped with loading.
  if (myPxX < imgSize && myPxY < imgSize) {
#pragma unroll
    for (int i = 0; i < imgsPerThread; i++) {
      if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
#pragma unroll
        for (int f = 0; f < filtersPerThread; f++) {
          prod[f][i] = minDiv + addScale * prod[f][i];
          denoms[f * imgPixels * numImages + i * B_X] = prod[f][i];
          target[f * imgPixels * numImages + i * B_X] =
              imgs[f * imgPixels * numImages + i * B_X] *
              __powf(prod[f][i], -powScale);
        }
      }
    }
  }
}
-
-/*
- * Block size B_YxB_X
- * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
- * blockIdx.y determines pixel.y, filter idx in batches of B_Y
- *
- * So each block does one pixel for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines filter idx
- *
- * imgs: (numFilters, imgPixels, numImages)
- * meanDiffs: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages) (out)
- * target: (numFilters, imgPixels, numImages) (out)
- *
- * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
- * numFilters must be divisible by B_Y
- */
/*
 * Cross-map response normalization: the window runs over a range of sizeF
 * FILTERS (at a fixed pixel) rather than over spatial positions.  In
 * `blocked` mode filters are normalized in disjoint groups of sizeF;
 * otherwise each filter uses a sliding window centered on itself, clipped to
 * [0, numFilters).  Inputs are read through texture objects.
 * Writes target = imgs * (minDiv + addScale * sum)^(-powScale); no denoms
 * output in this variant.
 */
template <
    int B_Y,
    int B_X,
    int imgsPerThread,
    bool checkCaseBounds,
    bool blocked>
__global__ void kFCNorm(
    cudaTextureObject_t imgs,
    cudaTextureObject_t meanDiffs,
    float* target,
    const int imgSize,
    const int numFilters,
    const int numImages,
    const int sizeF,
    const float addScale,
    const float powScale,
    const float minDiv) {
  const int imgPixels = imgSize * imgSize;
  const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
  const int numFilterBlocks = numFilters / B_Y;
  // blockIdx.x packs (pixel x, image block); .y packs (pixel y, filter block).
  const int pxIdxX = blockIdx.x / numImgBlocks;
  const int pxIdxY = blockIdx.y / numFilterBlocks;
  const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
  const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y;

  const int pxIdx = pxIdxY * imgSize + pxIdxX;

  const int imgIdx = blockImgIdx + threadIdx.x;
  const int imgOffset = ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx;
  // No filter term here: the f loop below adds f * imgPixels * numImages.
  const int meanDiffsOffset = pxIdx * numImages + imgIdx;
  // imgs += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
  // meanDiffs += pxIdx * numImages + imgIdx;
  target += ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx;

  float prod[imgsPerThread];
#pragma unroll
  for (int i = 0; i < imgsPerThread; i++) {
    if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
      prod[i] = 0;
    }
  }

  // Window of filters to sum over: the filter's own block (blocked mode) or
  // a sliding window centered on it (clipped at 0 and numFilters).
  const int startF =
      blocked ? (filterIdx / sizeF) * sizeF : -sizeF / 2 + filterIdx;
  const int loopStartF = blocked ? startF : MAX(0, startF);
  const int loopEndF = MIN(numFilters, startF + sizeF);

  for (int f = loopStartF; f < loopEndF; ++f) {
#pragma unroll
    for (int i = 0; i < imgsPerThread; i++) {
      if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
        prod[i] += square(tex1Dfetch<float>(
            meanDiffs, meanDiffsOffset + f * imgPixels * numImages + i * B_X));
      }
    }
  }

// Finalize and write the normalized activations.
#pragma unroll
  for (int i = 0; i < imgsPerThread; i++) {
    if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
      prod[i] = minDiv + addScale * prod[i];
      target[i * B_X] = tex1Dfetch<float>(imgs, imgOffset + i * B_X) *
          __powf(prod[i], -powScale);
    }
  }
}
-
-/*
- * Block size B_YxB_X
- * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
- * blockIdx.y determines pixel.y, filter idx in batches of B_Y
- *
- * So each block does one output pixel for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines filter idx
- *
- * imgs: (numFilters, imgPixels, numImages)
- * maxGrads: (numOutputs, imgPixels, numImages)
- * maxActs: (numOutputs, imgPixels, numImages)
- * target: (numFilters, imgPixels, numImages)
- *
- * numImages must be divisible by B_X*imgsPerThread
- * numFilters must be divisible by B_Y
- *
- * TODO: this isn't really ideal
- */
/*
 * Backward pass of cross-map max pooling.  For each input filter, sums the
 * gradients of all pooled outputs whose filter window covered it, gated on
 * the input having been the window's max (img == maxAct).  `add` selects
 * accumulation (scaleTargets/scaleOutputs) vs plain overwrite.
 */
template <int B_Y, int B_X, int imgsPerThread, bool add, bool checkCaseBounds>
__global__ void kCrossMapMaxPoolUndo(
    float* imgs,
    float* maxGrads,
    float* maxActs,
    float* target,
    const int imgSize,
    const int numFilters,
    const int numImages,
    const int startF,
    const int poolSize,
    const int numOutputs,
    const int stride,
    const float scaleTargets,
    const float scaleOutputs) {
  const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
  // const int numOutputs = DIVUP(numFilters, stride);
  const int numFilterBlocks = numFilters / B_Y;

  // blockIdx.x packs (pixel x, image block); .y packs (pixel y, filter block).
  const int pxIdxX = blockIdx.x / numImgBlocks;
  const int pxIdxY = blockIdx.y / numFilterBlocks;
  const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
  const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y;

  const int imgPixels = imgSize * imgSize;
  const int pxIdx = pxIdxY * imgSize + pxIdxX;
  const int imgIdx = blockImgIdx + threadIdx.x;

  imgs += ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx;
  // maxGrads/maxActs get no filter offset: the output-filter index `o` is
  // applied inside the loop below.
  maxGrads += (/*(filterIdx) * imgPixels +*/ pxIdx) * numImages + imgIdx;
  maxActs += (/*(filterIdx) * imgPixels +*/ pxIdx) * numImages + imgIdx;
  target += ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx;

  float prod[imgsPerThread];
  // if (imgIdx != 0 || pxIdx != 0 || filterIdx != 0) {
  //     return;
  // }
#pragma unroll
  for (int i = 0; i < imgsPerThread; i++) {
    prod[i] = 0;
  }

  if (filterIdx < numFilters) {
    // const int startOut = max(0, (filterIdx-startF-poolSize)/ stride +
    // 1);
    // Range of pooled outputs whose window [startF + o*stride, +poolSize)
    // contains filterIdx.
    const int loopStartOut =
        max(0, (filterIdx - startF - poolSize) / stride + 1);
    const int loopEndOut = min(numOutputs, (filterIdx - startF) / stride + 1);

    for (int o = loopStartOut; o < loopEndOut; ++o) {
#pragma unroll
      for (int i = 0; i < imgsPerThread; i++) {
        if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
          const float ma = maxActs[o * imgPixels * numImages + i * B_X];
          const float mg = maxGrads[o * imgPixels * numImages + i * B_X];
          const float img = imgs[i * B_X];
          // Branch-free gate: gradient flows only where this input was the
          // max of the pool window.
          prod[i] += (img == ma) * mg;
        }
      }
    }
    // printf("gpu f start: %d, end: %d\n", loopStartF, loopEndF);

    if (!add) {
#pragma unroll
      for (int i = 0; i < imgsPerThread; i++) {
        if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
          target[i * B_X] = prod[i];
        }
      }
    } else {
#pragma unroll
      for (int i = 0; i < imgsPerThread; i++) {
        if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
          target[i * B_X] =
              scaleTargets * target[i * B_X] + scaleOutputs * prod[i];
        }
      }
    }
  }
}
-
-/*
- * images: (numFilters, imgPixels, numImages)
- * maxGrads: (numOutputs, imgPixels, numImages)
- * maxActs: (numOutputs, imgPixels, numImages)
- * target: (numFilters, imgPixels, numImages)
- */
-// Host launcher for the cross-map (filter-dimension) max-pooling backward
-// pass.  Derives numFilters/numOutputs from the matrix shapes, validates the
-// dense (numMaps * imgPixels, numImages) layout, and dispatches
-// kCrossMapMaxPoolUndo<B_Y=4, B_X=32, imgsPerThread, add, checkCaseBounds>:
-//   imgsPerThread  : 4 if numImages % 128 == 0, 2 if % 64 == 0, else 1
-//   checkCaseBounds: numImages not a multiple of 32 * imgsPerThread
-//                    (the bounds-checked path always uses imgsPerThread == 1)
-//   add            : scaleTargets != 0 accumulates into target; otherwise
-//                    target is resized to match images and overwritten.
-void convCrossMapMaxPoolUndo(
- NVMatrix& images,
- NVMatrix& maxGrads,
- NVMatrix& maxActs,
- NVMatrix& target,
- const int imgSize,
- const int startF,
- const int poolSize,
- const int stride,
- const float scaleTargets,
- const float scaleOutputs) {
- int numImages = images.getNumCols();
- int imgPixels = imgSize * imgSize;
- int numFilters = images.getNumRows() / imgPixels;
- int numOutputs = maxActs.getNumRows() / imgPixels;
- assert(images.getNumRows() == numFilters * imgPixels);
- assert(maxGrads.getNumRows() == numOutputs * imgPixels);
- assert(maxGrads.getNumCols() == numImages);
- assert(maxGrads.isSameDims(maxActs));
-
- assert(images.getNumRows() == numFilters * imgPixels);
-
- // All operands must be non-transposed and contiguous so the kernel's
- // flat-offset arithmetic is valid.
- assert(!images.isTrans());
- assert(!target.isTrans());
- assert(!maxGrads.isTrans());
- assert(!maxActs.isTrans());
- assert(images.isContiguous());
- assert(maxGrads.isContiguous());
- assert(maxActs.isContiguous());
- assert(maxGrads.isSameDims(maxActs));
- // assert(numFilters % 16 == 0);
- // assert(numImages % 128 == 0);
-
- assert(stride <= poolSize);
- assert(startF <= 0);
- assert(
- startF + (numOutputs - 1) * stride + poolSize >=
- numFilters); // All filters must be covered
-
- dim3 threads(32, 4);
-
- // Wider per-thread image batches whenever the batch size divides evenly.
- int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
- dim3 blocks(
- imgSize * DIVUP(numImages, threads.x * imgsPerThread),
- imgSize * DIVUP(numFilters, threads.y));
- bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0;
-
- cudaStream_t stream = NVMatrix::getDefaultStream();
- if (scaleTargets == 0) {
- // Overwrite path: target takes images' shape, add = false.
- target.resize(images);
- if (!checkCaseBounds) {
- if (imgsPerThread == 4) {
- kCrossMapMaxPoolUndo<4, 32, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- scaleTargets,
- scaleOutputs);
- } else if (imgsPerThread == 2) {
- kCrossMapMaxPoolUndo<4, 32, 2, false, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- scaleTargets,
- scaleOutputs);
- } else {
- kCrossMapMaxPoolUndo<4, 32, 1, false, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- scaleTargets,
- scaleOutputs);
- }
- } else {
- kCrossMapMaxPoolUndo<4, 32, 1, false, true>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- scaleTargets,
- scaleOutputs);
- }
- } else {
- // Accumulate path: target must already have images' shape, add = true.
- assert(target.isSameDims(images));
- if (!checkCaseBounds) {
- if (imgsPerThread == 4) {
- kCrossMapMaxPoolUndo<4, 32, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- scaleTargets,
- scaleOutputs);
- } else if (imgsPerThread == 2) {
- kCrossMapMaxPoolUndo<4, 32, 2, true, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- scaleTargets,
- scaleOutputs);
- } else {
- kCrossMapMaxPoolUndo<4, 32, 1, true, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- scaleTargets,
- scaleOutputs);
- }
- } else {
- kCrossMapMaxPoolUndo<4, 32, 1, true, true>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- startF,
- poolSize,
- numOutputs,
- stride,
- scaleTargets,
- scaleOutputs);
- }
- }
- getLastCudaError("convCrossMapMaxPoolUndo: kernel execution failed");
-}
-
-/*
- * Block size B_YxB_X
- * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
- * blockIdx.y determines pixel.y, filter idx in batches of B_Y
- *
- * So each block does one output pixel for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines filter idx
- *
- * outGrads: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages)
- * inputs: (numFilters, imgPixels, numImages)
- * acts: (numFilters, imgPixels, numImages)
- * target: (numFilters, imgPixels, numImages)
- *
- * numImages must be divisible by B_X*imgsPerThread
- * numFilters must be divisible by B_Y
- *
- * TODO: this isn't really ideal
- */
-// Backward pass for cross-map (filter-dimension) response normalization.
-// Each thread sums the precomputed `acts` terms over the sizeF-wide filter
-// window covering filterIdx, then combines them with the forward inputs:
-//   target = inputs * sum + outGrads * denoms^(-powScale)
-// `blocked` selects contiguous filter blocks ((filterIdx/sizeF)*sizeF) vs. a
-// sliding window centered near filterIdx.  All reads except `target` go
-// through texture objects; `add` scales into an existing target.
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- bool add,
- bool checkCaseBounds,
- bool blocked>
-__global__ void kFRNormUndo(
- cudaTextureObject_t outGrads,
- cudaTextureObject_t denoms,
- cudaTextureObject_t inputs,
- cudaTextureObject_t acts,
- float* target,
- const int imgSize,
- const int numFilters,
- const int numImages,
- const int sizeF,
- const float powScale,
- const float scaleTargets,
- const float scaleOutputs) {
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int numFilterBlocks = numFilters / B_Y;
-
- // Decompose the 2D grid into (pixel.x, image block) and (pixel.y, filter
- // block) coordinates.
- const int pxIdxX = blockIdx.x / numImgBlocks;
- const int pxIdxY = blockIdx.y / numFilterBlocks;
- const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
- const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y;
-
- const int imgPixels = imgSize * imgSize;
- const int pxIdx = pxIdxY * imgSize + pxIdxX;
- const int imgIdx = blockImgIdx + threadIdx.x;
-
- // actsOffset omits the filter term; the filter loop below adds it.
- const int actsOffset = pxIdx * numImages + imgIdx;
- const int inputOffset = ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx;
-
- target += inputOffset;
- float prod[imgsPerThread];
-
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[i] = 0;
- }
-
- const int startF = blocked ? (filterIdx / sizeF) * sizeF
- : -sizeF + sizeF / 2 + 1 + filterIdx;
- const int loopStartF = blocked ? startF : MAX(0, startF);
- const int loopEndF = MIN(numFilters, startF + sizeF);
-
- // Accumulate acts over the filter window for each per-thread image.
- for (int f = loopStartF; f < loopEndF; ++f) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
- prod[i] += tex1Dfetch<float>(
- acts, actsOffset + f * imgPixels * numImages + i * B_X);
- }
- }
- }
-
- if (!add) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
- const float inp = tex1Dfetch<float>(inputs, inputOffset + i * B_X);
- const float out = tex1Dfetch<float>(outGrads, inputOffset + i * B_X);
- const float den = tex1Dfetch<float>(denoms, inputOffset + i * B_X);
- prod[i] = inp * prod[i] + out * __powf(den, -powScale);
- target[i * B_X] = prod[i];
- }
- }
- } else {
- // add == true: blend with the existing target contents.
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
- const float inp = tex1Dfetch<float>(inputs, inputOffset + i * B_X);
- const float out = tex1Dfetch<float>(outGrads, inputOffset + i * B_X);
- const float den = tex1Dfetch<float>(denoms, inputOffset + i * B_X);
- prod[i] = inp * prod[i] + out * __powf(den, -powScale);
- target[i * B_X] =
- scaleTargets * target[i * B_X] + scaleOutputs * prod[i];
- }
- }
- }
-}
-
-/*
- * Block size B_YxB_X
- * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
- * blockIdx.y determines pixel.y, filter idx in batches of B_Y
- *
- * So each block does one output pixel for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines filter idx
- *
- * outGrads: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages)
- * inputs: (numFilters, imgPixels, numImages)
- * acts: (numFilters, imgPixels, numImages)
- * target: (numFilters, imgPixels, numImages)
- *
- * numImages must be divisible by B_X*imgsPerThread
- * numFilters must be divisible by B_Y
- *
- * TODO: this is pretty wasteful of computation. a lot of threads basically
- * compute the same products.
- */
-// Variant of kFRNormUndo that recomputes the denominators on the fly instead
-// of reading a stored `denoms` tensor: denom = addScale * sum(inputs^2 over
-// the filter window) + minDiv.  The first filter loop accumulates the
-// gradient-weighted ratio terms; the second accumulates squared inputs.
-// Note the two loops use different window origins (see startF assignments).
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- bool add,
- bool checkCaseBounds,
- bool blocked>
-//__launch_bounds__(128,16)
-__global__ void kFRNormUndo2(
- cudaTextureObject_t outGrads,
- cudaTextureObject_t inputs,
- cudaTextureObject_t acts,
- float* target,
- const int imgSize,
- const int numFilters,
- const int numImages,
- const int sizeF,
- const float addScale,
- const float powScale,
- const float minDiv,
- const float scaleTargets,
- const float scaleOutputs) {
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int numFilterBlocks = numFilters / B_Y;
-
- const int pxIdxX = blockIdx.x / numImgBlocks;
- const int pxIdxY = blockIdx.y / numFilterBlocks;
- const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
- const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y;
-
- const int imgPixels = imgSize * imgSize;
- const int pxIdx = pxIdxY * imgSize + pxIdxX;
- const int imgIdx = blockImgIdx + threadIdx.x;
-
- // inpOffset omits the filter term (added inside the filter loops);
- // outOffset is this thread's fixed (filter, pixel, image) location.
- const int inpOffset = pxIdx * numImages + imgIdx;
- const int outOffset = ((filterIdx)*imgPixels + pxIdx) * numImages + imgIdx;
-
- target += outOffset;
-
- float prod[imgsPerThread];
- float denoms[imgsPerThread];
-
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[i] = 0;
- denoms[i] = 0;
- }
-
- int startF = blocked ? (filterIdx / sizeF) * sizeF
- : -sizeF + sizeF / 2 + 1 + filterIdx;
- int loopStartF = blocked ? startF : MAX(0, startF);
- int loopEndF = MIN(numFilters, startF + sizeF);
-
- // Pass 1: gradient-weighted ratio terms over the filter window.
- for (int f = loopStartF; f < loopEndF; ++f) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
- // If an input is zero, then we shouldn't divide by it.
- const float grad = tex1Dfetch<float>(
- outGrads, inpOffset + f * imgPixels * numImages + i * B_X);
- const float act = tex1Dfetch<float>(
- acts, inpOffset + f * imgPixels * numImages + i * B_X);
- const float inp =
- tex1Dfetch<float>(
- inputs, inpOffset + f * imgPixels * numImages + i * B_X) +
- (act == 0);
- prod[i] += grad * act * __powf(__fdividef(act, inp), 1.0f / powScale);
- }
- }
- }
-
- // Pass 2: rebuild the denominator window (different origin for the
- // non-blocked case) by summing squared inputs.
- startF = blocked ? (filterIdx / sizeF) * sizeF : -sizeF / 2 + filterIdx;
- loopStartF = blocked ? startF : MAX(0, startF);
- loopEndF = MIN(numFilters, startF + sizeF);
-
- for (int f = loopStartF; f < loopEndF; ++f) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
- denoms[i] += square(tex1Dfetch<float>(
- inputs, inpOffset + f * imgPixels * numImages + i * B_X));
- }
- }
- }
-
- if (!add) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
- const float inp = tex1Dfetch<float>(inputs, outOffset + i * B_X);
- const float out = tex1Dfetch<float>(outGrads, outOffset + i * B_X);
- denoms[i] = addScale * denoms[i] + minDiv;
- prod[i] =
- (-2 * powScale * addScale * inp * prod[i] +
- out * __powf(denoms[i], -powScale));
- target[i * B_X] = prod[i];
- }
- }
- } else {
- // add == true: blend with the existing target contents.
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
- const float inp = tex1Dfetch<float>(inputs, outOffset + i * B_X);
- const float out = tex1Dfetch<float>(outGrads, outOffset + i * B_X);
- denoms[i] = addScale * denoms[i] + minDiv;
- prod[i] =
- (-2 * powScale * addScale * inp * prod[i] +
- out * __powf(denoms[i], -powScale));
- target[i * B_X] =
- scaleTargets * target[i * B_X] + scaleOutputs * prod[i];
- }
- }
- }
-}
-
-/*
- * Block size B_YxB_X
- * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
- * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
- *
- * So each block does one output pixel for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines filter idx
- *
- * avgGrads: (numFilters, numOutputs, numImages)
- * target: (numFilters, imgPixels, numImages)
- *
- * numImages must be divisible by B_X*imgsPerThread
- * numFilters must be divisible by B_Y*filtersPerThread
- */
-
-// Backward pass for spatial average (or, when `sum` is true, plain sum)
-// pooling: distributes each output gradient uniformly over its pooling
-// region.  For every input pixel this thread owns, it accumulates
-// avgGrads over all output windows that contain the pixel, dividing by the
-// clipped region area unless `sum` is set.
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- bool sum,
- bool add,
- bool checkCaseBounds>
-__global__ void kLocalAvgUndo(
- float* avgGrads,
- float* target,
- const int imgSize,
- const int numFilters,
- const int numImages,
- const int subsX,
- const int startX,
- const int strideX,
- const int outputsX,
- const float scaleTargets,
- const float scaleOutputs) {
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int blockPxX = blockIdx.x / numImgBlocks;
- const int blockPxY = blockIdx.y / (numFilters / (B_Y * filtersPerThread));
-
- const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
- const int blockFilterIdx =
- (blockIdx.y % (numFilters / (B_Y * filtersPerThread))) * B_Y *
- filtersPerThread;
-
- const int blockPx = blockPxY * imgSize + blockPxX;
- const int numOutputs = outputsX * outputsX;
- const int imgPixels = imgSize * imgSize;
-
- // Range of output windows whose pooling region covers this input pixel.
- const int startOutputY =
- blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX;
- const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX);
- const int startOutputX =
- blockPxX - startX < subsX ? 0 : 1 + (blockPxX - startX - subsX) / strideX;
- const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX);
-
- const int imgIdx = blockImgIdx + threadIdx.x;
-
- avgGrads +=
- ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx;
- target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages +
- imgIdx;
-
- float prod[filtersPerThread][imgsPerThread];
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[f][i] = 0;
- }
- }
-
- // Pixels outside every pooling region receive a zero gradient.
- if (blockPxX >= startX &&
- blockPxX < startX + strideX * (outputsX - 1) + subsX &&
- blockPxY >= startX &&
- blockPxY < startX + strideX * (outputsX - 1) + subsX) {
- for (int my = startOutputY; my < endOutputY; my++) {
- const float regionStartY = fmaxf(0, startX + my * strideX);
- const float regionEndY = fminf(imgSize, startX + my * strideX + subsX);
- const float regionSizeY = regionEndY - regionStartY;
- for (int mx = startOutputX; mx < endOutputX; mx++) {
- const int outputIdx = my * outputsX + mx;
- const float regionStartX = fmaxf(0, startX + mx * strideX);
- const float regionEndX = fminf(imgSize, startX + mx * strideX + subsX);
- const float regionSizeX = regionEndX - regionStartX;
- // It's important to do the division here, because pushing division into
- // the below loops makes the code 4x slower.
- const float regionSizeInv =
- sum ? 1.0f : (1.0f / (regionSizeX * regionSizeY));
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[f][i] +=
- avgGrads
- [(f * B_Y * numOutputs + outputIdx) * numImages +
- i * B_X] *
- regionSizeInv;
- }
- }
- }
- }
- }
- }
-
- if (!add) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
- }
- }
- }
- } else {
- // add == true: blend with the existing target contents.
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- target[f * B_Y * imgPixels * numImages + i * B_X] =
- scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] +
- scaleOutputs * prod[f][i];
- }
- }
- }
- }
-}
-
-/*
- * Block size B_YxB_X
- * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
- * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
- *
- * So each block does one output pixel for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines filter idx
- *
- * imgs: (numFilters, imgPixels, numImages)
- * maxGrads: (numFilters, numOutputs, numImages)
- * maxActs: (numFilters, numOutputs, numImages)
- * target: (numFilters, imgPixels, numImages)
- *
- * numImages must be divisible by B_X*imgsPerThread
- * numFilters must be divisible by B_Y*filtersPerThread
- */
-// Backward pass for spatial max pooling: an input pixel receives the output
-// gradient from every pooling window whose recorded max activation equals
-// the pixel's value ((img == ma) * mg).  The thread-block's input pixels are
-// staged in shared memory so the window loop re-reads them cheaply.
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- bool add,
- bool checkCaseBounds>
-__global__ void kLocalMaxUndo(
- float* imgs,
- float* maxGrads,
- float* maxActs,
- float* target,
- const int imgSize,
- const int numFilters,
- const int numImages,
- const int subsX,
- const int startX,
- const int strideX,
- const int outputsX,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shImgs[B_Y * filtersPerThread][B_X * imgsPerThread];
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int blockPxX = blockIdx.x / numImgBlocks;
- const int blockPxY = blockIdx.y / (numFilters / (B_Y * filtersPerThread));
-
- const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
- const int blockFilterIdx =
- (blockIdx.y % (numFilters / (B_Y * filtersPerThread))) * B_Y *
- filtersPerThread;
-
- const int blockPx = blockPxY * imgSize + blockPxX;
- const int numOutputs = outputsX * outputsX;
- const int imgPixels = imgSize * imgSize;
-
- // Range of output windows whose pooling region covers this input pixel.
- const int startOutputY =
- blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX;
- const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX);
- const int startOutputX =
- blockPxX - startX < subsX ? 0 : 1 + (blockPxX - startX - subsX) / strideX;
- const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX);
-
- const int imgIdx = blockImgIdx + threadIdx.x;
-
- imgs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages +
- imgIdx;
- maxGrads +=
- ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx;
- maxActs += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx;
-
- target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages +
- imgIdx;
-
- float prod[filtersPerThread][imgsPerThread];
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[f][i] = 0;
- }
- }
-
- // Pixels outside every pooling region keep a zero gradient.
- if (blockPxX >= startX &&
- blockPxX < startX + strideX * (outputsX - 1) + subsX &&
- blockPxY >= startX &&
- blockPxY < startX + strideX * (outputsX - 1) + subsX) {
- // Stage this block's input values in shared memory.
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i] =
- imgs[f * B_Y * imgPixels * numImages + i * B_X];
- }
- }
- }
- for (int my = startOutputY; my < endOutputY; my++) {
- for (int mx = startOutputX; mx < endOutputX; mx++) {
- const int outputIdx = my * outputsX + mx;
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- const float ma = maxActs
- [(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
- const float mg = maxGrads
- [(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
- const float img =
- shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i];
-
- // Route the gradient only to the pixel(s) that achieved the max.
- prod[f][i] += (img == ma) * mg;
- }
- }
- }
- }
- }
- }
- if (!add) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
- }
- }
- }
- } else {
- // add == true: blend with the existing target contents.
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- target[f * B_Y * imgPixels * numImages + i * B_X] =
- scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] +
- scaleOutputs * prod[f][i];
- }
- }
- }
- }
-}
-
-/*
- * acts := -2 x scale x acts x outGrads / denoms
- */
-// Elementwise preliminary step for the response-norm backward pass:
-// rewrites acts in place as scale * outGrads * acts / denoms (see the
-// header comment above; `scale` carries the -2*powScale factor).  Threads
-// stride over the flat array, eltsPerThread elements per iteration, with a
-// bounds check on each element for the ragged tail.
-template <int B_X, int eltsPerThread>
-__global__ void kRNormUndoPrelims(
- float* acts,
- cudaTextureObject_t denoms,
- cudaTextureObject_t outGrads,
- const uint numElements,
- const float scale) {
- const uint e = B_X * blockIdx.x * eltsPerThread + threadIdx.x;
- const uint numThreads = B_X * gridDim.x;
- for (uint i = e; i < numElements; i += numThreads * eltsPerThread) {
-#pragma unroll
- for (uint k = 0; k < eltsPerThread; k++) {
- if (i + k * B_X < numElements) {
- acts[i + k * B_X] = __fdividef(
- scale * tex1Dfetch<float>(outGrads, i + k * B_X) *
- acts[i + k * B_X],
- tex1Dfetch<float>(denoms, i + k * B_X));
- }
- }
- }
-}
-
-/*
- * Block size B_YxB_X
- * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
- * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
- *
- * So each block does one output pixel for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines filter idx
- *
- * outGrads: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages)
- * inputs: (numFilters, imgPixels, numImages)
- * acts: (numFilters, imgPixels, numImages)
- * target: (numFilters, imgPixels, numImages)
- *
- * numImages must be divisible by B_X*imgsPerThread
- * numFilters must be divisible by B_Y*filtersPerThread
- *
- * TODO: this isn't really ideal
- */
-// Backward pass for spatial (within-map) response normalization.  Each
-// thread sums the precomputed acts terms over the sizeX x sizeX spatial
-// window around its pixel, then combines with the forward tensors:
-//   target = inputs * sum + outGrads * denoms^(-powScale)
-// Unlike most kernels here, add vs. overwrite is decided at runtime by
-// scaleTargets == 0 rather than by a template flag.
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- bool checkCaseBounds>
-__global__ void kRNormUndo(
- float* outGrads,
- float* denoms,
- float* inputs,
- float* acts,
- float* target,
- const int imgSize,
- const int numFilters,
- const int numImages,
- const int sizeX,
- const float powScale,
- const float scaleTargets,
- const float scaleOutputs) {
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int numFilterBlocks = numFilters / (B_Y * filtersPerThread);
-
- const int blockPxX = blockIdx.x / numImgBlocks;
- const int blockPxY = blockIdx.y / numFilterBlocks;
-
- const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
- const int blockFilterIdx =
- (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
-
- const int blockPx = blockPxY * imgSize + blockPxX;
- const int imgPixels = imgSize * imgSize;
-
- // Clipped spatial window of pixels whose norm region includes blockPx.
- const int startY = MAX(0, blockPxY + sizeX / 2 - sizeX + 1);
- const int startX = MAX(0, blockPxX + sizeX / 2 - sizeX + 1);
- const int endY = MIN(imgSize, blockPxY + sizeX / 2 + 1);
- const int endX = MIN(imgSize, blockPxX + sizeX / 2 + 1);
-
- const int imgIdx = blockImgIdx + threadIdx.x;
-
- // acts keeps a map-level base (pixel added in the loop); the rest point at
- // this thread's fixed pixel.
- acts += ((blockFilterIdx + threadIdx.y) * imgPixels) * numImages + imgIdx;
- inputs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages +
- imgIdx;
- denoms += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages +
- imgIdx;
- outGrads +=
- ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages +
- imgIdx;
- target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages +
- imgIdx;
-
- float prod[filtersPerThread][imgsPerThread];
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[f][i] = 0;
- }
- }
-
- // Accumulate the acts terms over the spatial window.
- for (int sy = startY; sy < endY; sy++) {
- for (int sx = startX; sx < endX; sx++) {
- const int outPx = sy * imgSize + sx;
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[f][i] +=
- acts[(f * B_Y * imgPixels + outPx) * numImages + i * B_X];
- }
- }
- }
- }
- }
- // outGrads += blockPx * numImages;
- if (scaleTargets == 0) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- const float inp = inputs[(f * B_Y * imgPixels) * numImages + i * B_X];
- const float out =
- outGrads[(f * B_Y * imgPixels) * numImages + i * B_X];
- const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X];
- prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
- target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
- }
- }
- }
- } else {
- // scaleTargets != 0: blend with the existing target contents.
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- const float inp = inputs[(f * B_Y * imgPixels) * numImages + i * B_X];
- const float out =
- outGrads[(f * B_Y * imgPixels) * numImages + i * B_X];
- const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X];
- prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
- target[f * B_Y * imgPixels * numImages + i * B_X] =
- scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] +
- scaleOutputs * prod[f][i];
- }
- }
- }
- }
-}
-
-/*
- * Block size 16xB_X
- * blockIdx.x determines 4x4 pixel.x region, image idx in batches of
- * B_X*imgsPerThread blockIdx.y determines 4x4 pixel.y region, filter idx in
- * batches of filtersPerThread
- *
- * So each block does 4x4 region for some number of images/filters.
- *
- * threadIdx.x determines img idx
- * threadIdx.y determines pixel idx
- *
- * outGrads: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages)
- * inputs: (numFilters, imgPixels, numImages)
- * acts: (numFilters, imgPixels, numImages)
- * target: (numFilters, imgPixels, numImages)
- *
- * B_X one of 8, 16, 32
- * imgsPerThread one of 1, 2, 4, 8, 16
- *
- * B_XximgsPerThread MUST be divisible by 32.
- * Number of filters MUST be divisible by filtersPerThread.
- *
- * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
- * numFilters must be divisible by filtersPerThread
- *
- * Final write-out will not be fully coalesced unless B_X is 32. But there's a
- * lot more reading than writing here, and the reading is all coalesced, so it
- * should be OK.
- */
-// Tiled variant of kRNormUndo: each block covers a 4x4 pixel region and
-// cooperatively loads one acts pixel at a time into shared memory, so the
-// heavily-overlapping spatial windows are read from shared memory rather
-// than repeatedly from global memory.  Each thread row then accepts only
-// the pixels inside its own window (isInY / myStartPxX tests).
-template <
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- bool add,
- bool checkCaseBounds>
-__global__ void kRNormUndo2(
- float* outGrads,
- float* denoms,
- float* inputs,
- float* acts,
- float* target,
- const int imgSize,
- const int numFilters,
- const int numImages,
- const int sizeX,
- const float powScale,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shActs[filtersPerThread][B_X * imgsPerThread];
- const int imgPixels = imgSize * imgSize;
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int numFilterBlocks = numFilters / (filtersPerThread);
- const int blockPxX = 4 * (blockIdx.x / numImgBlocks);
- const int blockPxY = 4 * (blockIdx.y / numFilterBlocks);
- const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
- const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread;
-
- // Flat thread id re-split into a 32-wide loader layout for the shared
- // memory fills below.
- const int tidx = threadIdx.y * B_X + threadIdx.x;
- const int loadY = tidx / 32, loadX = tidx % 32;
-
- // Union of the norm windows of all 16 pixels in this block's 4x4 tile.
- const int startPxX = MAX(0, -DIVUP(sizeX, 2) + blockPxX + 1);
- const int startPxY = MAX(0, -DIVUP(sizeX, 2) + blockPxY + 1);
- const int endPxX = MIN(imgSize, blockPxX + sizeX / 2 + 4);
- const int endPxY = MIN(imgSize, blockPxY + sizeX / 2 + 4);
-
- // This thread's own pixel within the 4x4 tile, and its private window.
- const int myPxX = blockPxX + threadIdx.y % 4;
- const int myPxY = blockPxY + threadIdx.y / 4;
- const int myPxIdx = myPxY * imgSize + myPxX;
- // const bool doWork = myPxX < imgSize && myPxY < imgSize;
- const int myStartPxY = -DIVUP(sizeX, 2) + myPxY + 1;
- const int myStartPxX = -DIVUP(sizeX, 2) + myPxX + 1;
- const int myEndPxY = myPxY + sizeX / 2 + 1;
- const int myEndPxX = myPxX + sizeX / 2 + 1;
-
- const int imgIdx = blockImgIdx + threadIdx.x;
-
- acts +=
- (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
- denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
- inputs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
- outGrads += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
- target += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
-
- float prod[filtersPerThread][imgsPerThread];
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[f][i] = 0;
- }
- }
-
- for (int y = startPxY; y < endPxY; y++) {
- const bool isInY = y >= myStartPxY && y < myEndPxY;
- for (int x = startPxX; x < endPxX; x++) {
- const int px = y * imgSize + x;
-// All the threads load a pixel from memory
-#pragma unroll
- for (int ly = 0; ly < filtersPerThread; ly += B_X / 2) {
- if (filtersPerThread % (B_X / 2) == 0 ||
- ly + loadY < filtersPerThread) {
-#pragma unroll
- for (int lx = 0; lx < B_X * imgsPerThread; lx += 32) {
- if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) {
- shActs[ly + loadY][lx + loadX] =
- acts[(ly * imgPixels + px) * numImages + lx];
- }
- }
- }
- }
- __syncthreads();
-
- // Each row of threads decides if it's interested in this pixel
- if (isInY && x >= myStartPxX && x < myEndPxX) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[f][i] += shActs[f][threadIdx.x + i * B_X];
- }
- }
- }
- }
- __syncthreads();
- }
- }
- // Rebase acts from the loader layout onto this thread's own pixel/image.
- acts -= (loadY * imgPixels - myPxIdx) * numImages + loadX;
- acts += threadIdx.x;
- // Edge tiles may extend past the image; only in-bounds pixels write out.
- if (myPxX < imgSize && myPxY < imgSize) {
- if (!add) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- const float out = outGrads[f * imgPixels * numImages + i * B_X];
- const float den = denoms[f * imgPixels * numImages + i * B_X];
- const float inp = inputs[f * imgPixels * numImages + i * B_X];
- prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
- target[f * imgPixels * numImages + i * B_X] = prod[f][i];
- }
- }
- }
- } else {
- // add == true: blend with the existing target contents.
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- const float out = outGrads[f * imgPixels * numImages + i * B_X];
- const float den = denoms[f * imgPixels * numImages + i * B_X];
- const float inp = inputs[f * imgPixels * numImages + i * B_X];
- prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
- target[f * imgPixels * numImages + i * B_X] =
- scaleTargets * target[f * imgPixels * numImages + i * B_X] +
- scaleOutputs * prod[f][i];
- }
- }
- }
- }
- }
-}
-
-// Convenience overload: max-pool backward pass that overwrites target
-// (scaleTargets = 0, scaleOutput = 1).  Forwards to the full overload.
-void convLocalMaxUndo(
- NVMatrix& images,
- NVMatrix& maxGrads,
- NVMatrix& maxActs,
- NVMatrix& target,
- int subsX,
- int startX,
- int strideX,
- int outputsX) {
- convLocalMaxUndo(
- images,
- maxGrads,
- maxActs,
- target,
- subsX,
- startX,
- strideX,
- outputsX,
- 0,
- 1);
-}
-
-/*
- * imgs: (numFilters, imgPixels, numImages)
- * maxGrads: (numFilters, numOutputs, numImages)
- * rMaxActs: (numFilters, numOutputs, numImages)
- * target: (numFilters, imgPixels, numImages)
- */
-void convLocalMaxUndo(
- NVMatrix& images,
- NVMatrix& maxGrads,
- NVMatrix& maxActs,
- NVMatrix& target,
- int subsX,
- int startX,
- int strideX,
- int outputsX,
- float scaleTargets,
- float scaleOutput) {
- int outputs = outputsX * outputsX;
- int numImages = images.getNumCols();
- int numFilters = maxGrads.getNumRows() / outputs;
- int imgPixels = images.getNumRows() / numFilters;
- assert(images.getNumRows() == numFilters * imgPixels);
- int imgSize = int(sqrt(imgPixels));
-
- assert(imgSize * imgSize == imgPixels);
- assert(maxGrads.getNumRows() == numFilters * outputs);
- assert(maxGrads.getNumCols() == numImages);
- assert(!images.isTrans());
- assert(!target.isTrans());
- assert(!maxGrads.isTrans());
- assert(!maxActs.isTrans());
- assert(images.isContiguous());
- assert(maxGrads.isContiguous());
- assert(maxActs.isContiguous());
- assert(maxGrads.isSameDims(maxActs));
- assert(numFilters % 16 == 0);
- // assert(numImages % 128 == 0);
-
- assert(strideX <= subsX);
-
- target.resize(images);
- assert(target.isContiguous());
- int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
- int checkCaseBounds = numImages % (32 * imgsPerThread) != 0;
- dim3 threads(32, 4);
- dim3 blocks(
- DIVUP(numImages, 32 * imgsPerThread) * imgSize,
- (numFilters / (4 * 2)) * imgSize);
- cudaStream_t stream = NVMatrix::getDefaultStream();
- if (imgsPerThread == 4) {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalMaxUndo<4, 32, 4, 2, false, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalMaxUndo<4, 32, 4, 2, true, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalMaxUndo<4, 32, 4, 2, false, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalMaxUndo<4, 32, 4, 2, true, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (imgsPerThread == 2) {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalMaxUndo<4, 32, 2, 2, false, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalMaxUndo<4, 32, 2, 2, true, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalMaxUndo<4, 32, 2, 2, false, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalMaxUndo<4, 32, 2, 2, true, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- }
- } else {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalMaxUndo<4, 32, 1, 2, false, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalMaxUndo<4, 32, 1, 2, true, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalMaxUndo<4, 32, 1, 2, false, false>
- <<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalMaxUndo<4, 32, 1, 2, true, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- maxGrads.getDevData(),
- maxActs.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- }
- }
-
- getLastCudaError("convLocalMaxUndo: kernel execution failed");
-}
-
-void convLocalAvgUndo(
- NVMatrix& avgGrads,
- NVMatrix& target,
- int subsX,
- int startX,
- int strideX,
- int outputsX,
- int imgSize,
- bool sum) {
- convLocalAvgUndo(
- avgGrads, target, subsX, startX, strideX, outputsX, imgSize, sum, 0, 1);
-}
-
-/*
- * avgGrads: (numFilters, numOutputs, numImages)
- * target: (numFilters, imgPixels, numImages)
- */
-void convLocalAvgUndo(
- NVMatrix& avgGrads,
- NVMatrix& target,
- int subsX,
- int startX,
- int strideX,
- int outputsX,
- int imgSize,
- bool sum,
- float scaleTargets,
- float scaleOutput) {
- int numImages = avgGrads.getNumCols();
-
- int outputs = outputsX * outputsX;
- int imgPixels = imgSize * imgSize;
- int numFilters = avgGrads.getNumRows() / outputs;
- assert(avgGrads.getNumRows() == numFilters * outputs);
-
- assert(!target.isTrans());
- assert(!avgGrads.isTrans());
- assert(avgGrads.isContiguous());
- assert(numFilters % 16 == 0);
- // assert(numImages % 128 == 0);
-
- assert(strideX <= subsX);
-
- target.resize(numFilters * imgPixels, numImages);
- assert(target.isContiguous());
- int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
- int checkCaseBounds = numImages % (32 * imgsPerThread) != 0;
- dim3 threads(32, 4);
- dim3 blocks(
- DIVUP(numImages, 32 * imgsPerThread) * imgSize,
- (numFilters / (4 * 4)) * imgSize);
- cudaStream_t stream = NVMatrix::getDefaultStream();
- bool scale = !(scaleTargets == 0 && scaleOutput == 1);
- if (sum) {
- if (imgsPerThread == 4) {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 4, 4, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 4, 4, true, true, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 4, 4, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 4, 4, true, true, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (imgsPerThread == 2) {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 2, 4, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 2, 4, true, true, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 2, 4, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 2, 4, true, true, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- }
- } else {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 1, 4, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 1, 4, true, true, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 1, 4, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 1, 4, true, true, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else {
- if (imgsPerThread == 4) {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 4, 4, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 4, 4, false, true, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 4, 4, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 4, 4, false, true, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (imgsPerThread == 2) {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 2, 4, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 2, 4, false, true, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 2, 4, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 2, 4, false, true, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- }
- } else {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 1, 4, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 1, 4, false, true, true>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- kLocalAvgUndo<4, 32, 1, 4, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- } else {
- kLocalAvgUndo<4, 32, 1, 4, false, true, false>
- <<<blocks, threads, 0, stream>>>(
- avgGrads.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- subsX,
- startX,
- strideX,
- outputsX,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- }
-
- getLastCudaError("convLocalAvgUndo: kernel execution failed");
-}
-
-void convResponseNorm(
- NVMatrix& images,
- NVMatrix& denoms,
- NVMatrix& target,
- int numFilters,
- int sizeX,
- float addScale,
- float powScale,
- float minDiv) {
- convContrastNorm(
- images,
- images,
- denoms,
- target,
- numFilters,
- sizeX,
- addScale,
- powScale,
- minDiv);
-}
-
-/*
- * images: (numFilters, imgPixels, numImages)
- * meanDiffs: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages) (out)
- * target: (numFilters, imgPixels, numImages) (out)
- */
-void convContrastNorm(
- NVMatrix& images,
- NVMatrix& meanDiffs,
- NVMatrix& denoms,
- NVMatrix& target,
- int numFilters,
- int sizeX,
- float addScale,
- float powScale,
- float minDiv) {
- int numImages = images.getNumCols();
- int imgPixels = images.getNumRows() / numFilters;
- assert(images.getNumRows() == numFilters * imgPixels);
- int imgSize = int(sqrt(imgPixels));
- assert(imgSize * imgSize == imgPixels);
- assert(meanDiffs.isSameDims(images));
-
- assert(!meanDiffs.isTrans());
- assert(!images.isTrans());
- assert(images.isContiguous());
- assert(meanDiffs.isContiguous());
- assert(numFilters % 16 == 0 || numFilters <= 8);
-
- target.resize(images);
- denoms.resize(images);
- assert(target.isContiguous());
- cudaStream_t stream = NVMatrix::getDefaultStream();
- if (sizeX >= 6 && numFilters % 4 == 0) {
- // This one is faster for large regions (my tests show regions >= 6...)
- int imgsPerThread = 8;
- int filtersPerThread = 4;
- int bx = 8;
- bool checkCaseBounds = numImages % (bx * imgsPerThread) != 0;
- assert((imgsPerThread * bx) % 32 == 0);
- assert(numFilters % filtersPerThread == 0);
- dim3 threads(bx, 16);
- dim3 blocks(
- DIVUP(imgSize, 4) * DIVUP(numImages, bx * imgsPerThread),
- DIVUP(imgSize, 4) * numFilters / filtersPerThread);
-
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kCNorm2<8, 8, 4, true>, cudaFuncCachePreferL1); // L1 faster here
- kCNorm2<8, 8, 4, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kCNorm2<8, 8, 4, false>, cudaFuncCachePreferL1); // L1 faster here
- kCNorm2<8, 8, 4, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- }
- } else {
- bool checkCaseBounds = numImages % 128 != 0;
- if (numFilters <= 8) {
- dim3 threads(128);
- dim3 blocks(DIVUP(numImages, 128) * imgSize, imgSize);
- if (numFilters == 1) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 1, true>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 1, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 1, false>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 1, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- }
- } else if (numFilters == 2) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 2, true>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 2, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 2, false>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 2, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- }
- } else if (numFilters == 3) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 3, true>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 3, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 3, false>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 3, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- }
- } else if (numFilters == 4) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 4, true>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 4, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 4, false>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 4, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- }
- } else if (numFilters == 5) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 5, true>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 5, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 5, false>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 5, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- }
- } else if (numFilters == 6) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 6, true>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 6, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 6, false>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 6, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- }
- } else if (numFilters == 7) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 7, true>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 7, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 7, false>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 7, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- }
- } else if (numFilters == 8) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 8, true>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 8, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kCNorm_fewfilter<1, 8, false>, cudaFuncCachePreferL1);
- kCNorm_fewfilter<1, 8, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- }
- }
- } else {
- dim3 threads(32, 4);
- dim3 blocks(
- DIVUP(numImages, threads.x * 4),
- (numFilters / (threads.y * 2)),
- imgPixels);
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kCNorm_manyfilter<4, 32, 4, 2, true>, cudaFuncCachePreferL1);
- kCNorm_manyfilter<4, 32, 4, 2, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kCNorm_manyfilter<4, 32, 4, 2, false>, cudaFuncCachePreferL1);
- kCNorm_manyfilter<4, 32, 4, 2, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- meanDiffs.getDevData(),
- denoms.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- addScale,
- powScale,
- minDiv);
- }
- }
- }
- getLastCudaError("convResponseNorm: kernel execution failed");
-}
-
-void convContrastNormUndo(
- NVMatrix& outGrads,
- NVMatrix& denoms,
- NVMatrix& meanDiffs,
- NVMatrix& acts,
- NVMatrix& target,
- int numFilters,
- int sizeX,
- float addScale,
- float powScale,
- float scaleTargets,
- float scaleOutput) {
- convResponseNormUndo(
- outGrads,
- denoms,
- meanDiffs,
- acts,
- target,
- numFilters,
- sizeX,
- addScale,
- powScale,
- scaleTargets,
- scaleOutput);
-}
-
-/*
- * outGrads: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages)
- * inputs: (numFilters, imgPixels, numImages)
- * acts: (numFilters, imgPixels, numImages)
- * target: (numFilters, imgPixels, numImages)
- *
- * THIS WILL OVERWRITE THE ACTS MATRIX.
- */
-void convResponseNormUndo(
- NVMatrix& outGrads,
- NVMatrix& denoms,
- NVMatrix& inputs,
- NVMatrix& acts,
- NVMatrix& target,
- int numFilters,
- int sizeX,
- float addScale,
- float powScale,
- float scaleTargets,
- float scaleOutput) {
- int numImages = outGrads.getNumCols();
- int imgPixels = outGrads.getNumRows() / numFilters;
-
- int imgSize = int(sqrt(imgPixels));
- assert(imgSize * imgSize == imgPixels);
-
- assert(outGrads.getNumRows() == numFilters * imgPixels);
-
- assert(denoms.isSameDims(outGrads));
- assert(acts.isSameDims(denoms));
- assert(!denoms.isTrans());
- assert(!outGrads.isTrans());
- assert(!acts.isTrans());
- assert(!target.isTrans());
- assert(outGrads.isContiguous());
-
- assert(numFilters % 16 == 0);
-
- target.resize(outGrads);
- assert(target.isContiguous());
- // First do acts := -2 x scale x acts x outGrads / denoms
- // so that the main routine only has to do an addition in its inner loop.
- int prelimEltsPerThread = 8;
- dim3 threads(128);
- dim3 blocks(
- DIVUP(outGrads.getNumElements(), (threads.x * prelimEltsPerThread)));
- bool checkPrelimBounds =
- outGrads.getNumElements() % (threads.x * prelimEltsPerThread) != 0;
- // printf("num elts: %d, blocks: %d\n", outGrads.getNumElements(), blocks.x);
- cudaStream_t stream = NVMatrix::getDefaultStream();
- kRNormUndoPrelims<128, 8><<<blocks, threads, 0, stream>>>(
- acts.getDevData(),
- denoms.getTextureObject(),
- outGrads.getTextureObject(),
- outGrads.getNumElements(),
- -2 * addScale * powScale);
-
- // Now the main routine
- if (sizeX >= 6 && numFilters % 4 == 0) {
- // This one is faster for large regions (my tests show regions >= 6...)
- // NOTE: this stuff is not optimized for Kepler. Only kRNormUndo is.
- int imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2;
- int filtersPerThread = 4;
- int bx = 16;
- bool checkCaseBounds = numImages % (bx * imgsPerThread) != 0;
- assert((imgsPerThread * bx) % 32 == 0);
-
- threads = dim3(bx, 16);
- blocks = dim3(
- DIVUP(imgSize, 4) * DIVUP(numImages, bx * imgsPerThread),
- DIVUP(imgSize, 4) * numFilters / filtersPerThread);
- if (imgsPerThread == 8) {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 8, 4, true, true>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 8, 4, true, true><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 8, 4, false, true>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 8, 4, false, true><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 8, 4, true, false>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 8, 4, true, false><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 8, 4, false, false>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 8, 4, false, false><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (imgsPerThread == 4) {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 4, 4, true, true>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 4, 4, true, true><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 4, 4, false, true>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 4, 4, false, true><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 4, 4, true, false>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 4, 4, true, false><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 4, 4, false, false>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 4, 4, false, false><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- }
- }
- } else {
- if (checkCaseBounds) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 2, 4, true, true>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 2, 4, true, true><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 2, 4, false, true>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 2, 4, false, true><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 2, 4, true, false>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 2, 4, true, false><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kRNormUndo2<16, 2, 4, false, false>, cudaFuncCachePreferL1);
- kRNormUndo2<16, 2, 4, false, false><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else {
- int imgsPerThread = numImages % 128 == 0 ? 4 : 1;
- bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0;
- threads = dim3(32, 4);
- blocks = dim3(
- DIVUP(numImages, 32 * imgsPerThread) * imgSize,
- (numFilters / (4 * 2)) * imgSize);
-
- if (imgsPerThread == 4) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kRNormUndo<4, 32, 4, 2, true>, cudaFuncCachePreferL1);
- kRNormUndo<4, 32, 4, 2, true><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kRNormUndo<4, 32, 4, 2, false>, cudaFuncCachePreferL1);
- kRNormUndo<4, 32, 4, 2, false><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kRNormUndo<4, 32, 1, 2, true>, cudaFuncCachePreferL1);
- kRNormUndo<4, 32, 1, 2, true><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kRNormUndo<4, 32, 1, 2, false>, cudaFuncCachePreferL1);
- kRNormUndo<4, 32, 1, 2, false><<<blocks, threads, 0, stream>>>(
- outGrads.getDevData(),
- denoms.getDevData(),
- inputs.getDevData(),
- acts.getDevData(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeX,
- powScale,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- getLastCudaError("kRNormUndo: kernel execution failed");
-}
-
-/*
- * imgs: (numChannels, imgPixels, numImages) with given imgStride
- * target: (numChannels, tgtPixels, numImages)
- *
- * imgSize = scale * tgtSize
- */
-void convResizeBilinear(
- NVMatrix& images,
- NVMatrix& target,
- int imgSize,
- int tgtSize,
- float scale) {
- assert(!images.isTrans());
- assert(!target.isTrans());
- int imgPixels = imgSize * imgSize;
- int tgtPixels = tgtSize * tgtSize;
- int numChannels = images.getNumRows() / imgPixels;
- int numImages = images.getNumCols();
- assert(images.getNumRows() == numChannels * imgPixels);
-
- target.resize(numChannels * tgtPixels, numImages);
- assert(target.isContiguous());
- int numChunksX = DIVUP(tgtSize, 4);
- int numChunks = numChunksX * numChunksX;
- double imgCenter = imgSize * 0.5;
- double tgtCenter = tgtSize * 0.5;
- double centerScale = imgCenter - tgtCenter * scale;
-
- int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
- bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0;
- cudaStream_t stream = NVMatrix::getDefaultStream();
- dim3 threads(32, 16);
- dim3 blocks(DIVUP(numImages, imgsPerThread * 32), numChannels * numChunks);
- if (imgsPerThread == 4) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(kResizeBilinear<4, true>, cudaFuncCachePreferL1);
- kResizeBilinear<4, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- tgtSize,
- numImages,
- images.getStride(),
- scale,
- centerScale);
- } else {
- cudaFuncSetCacheConfig(kResizeBilinear<4, false>, cudaFuncCachePreferL1);
- kResizeBilinear<4, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- tgtSize,
- numImages,
- images.getStride(),
- scale,
- centerScale);
- }
- } else if (imgsPerThread == 2) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(kResizeBilinear<2, true>, cudaFuncCachePreferL1);
- kResizeBilinear<2, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- tgtSize,
- numImages,
- images.getStride(),
- scale,
- centerScale);
- } else {
- cudaFuncSetCacheConfig(kResizeBilinear<2, false>, cudaFuncCachePreferL1);
- kResizeBilinear<2, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- tgtSize,
- numImages,
- images.getStride(),
- scale,
- centerScale);
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(kResizeBilinear<1, true>, cudaFuncCachePreferL1);
- kResizeBilinear<1, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- tgtSize,
- numImages,
- images.getStride(),
- scale,
- centerScale);
- } else {
- cudaFuncSetCacheConfig(kResizeBilinear<1, false>, cudaFuncCachePreferL1);
- kResizeBilinear<1, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgSize,
- tgtSize,
- numImages,
- images.getStride(),
- scale,
- centerScale);
- }
- }
- getLastCudaError("convResizeBilinear: kernel execution failed");
-}
-
-/*
- * imgs: (3, imgPixels, numImages) with given imgStride
- * target: (3, imgPixels, numImages)
- */
-void convRGBToYUV(NVMatrix& images, NVMatrix& target) {
- assert(!images.isTrans());
- assert(!target.isTrans());
- int imgPixels = images.getNumRows() / 3;
- int numImages = images.getNumCols();
- assert(images.getNumRows() == 3 * imgPixels);
-
- target.resize(3 * imgPixels, numImages);
- assert(target.isContiguous());
- int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
- bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0;
- cudaStream_t stream = NVMatrix::getDefaultStream();
- dim3 threads(32, 4);
- dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4));
- if (imgsPerThread == 4) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(kRGBToYUV<4, true>, cudaFuncCachePreferL1);
- kRGBToYUV<4, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- } else {
- cudaFuncSetCacheConfig(kRGBToYUV<4, false>, cudaFuncCachePreferL1);
- kRGBToYUV<4, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- }
- } else if (imgsPerThread == 2) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(kRGBToYUV<2, true>, cudaFuncCachePreferL1);
- kRGBToYUV<2, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- } else {
- cudaFuncSetCacheConfig(kRGBToYUV<2, false>, cudaFuncCachePreferL1);
- kRGBToYUV<2, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(kRGBToYUV<1, true>, cudaFuncCachePreferL1);
- kRGBToYUV<1, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- } else {
- cudaFuncSetCacheConfig(kRGBToYUV<1, false>, cudaFuncCachePreferL1);
- kRGBToYUV<1, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- }
- }
- getLastCudaError("convRGBToYUV: kernel execution failed");
-}
-
-/*
- * imgs: (3, imgPixels, numImages) with given imgStride
- * target: (3, imgPixels, numImages)
- */
-void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center) {
- assert(!images.isTrans());
- assert(!target.isTrans());
- int imgPixels = images.getNumRows() / 3;
- int numImages = images.getNumCols();
- assert(images.getNumRows() == 3 * imgPixels);
-
- target.resize(3 * imgPixels, numImages);
- assert(target.isContiguous());
-
- int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
- bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0;
- dim3 threads(32, 4);
- dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4));
- cudaStream_t stream = NVMatrix::getDefaultStream();
- if (imgsPerThread == 4) {
- if (center) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(kRGBToLAB<4, true, true>, cudaFuncCachePreferL1);
- kRGBToLAB<4, true, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- } else {
- cudaFuncSetCacheConfig(
- kRGBToLAB<4, false, true>, cudaFuncCachePreferL1);
- kRGBToLAB<4, false, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kRGBToLAB<4, true, false>, cudaFuncCachePreferL1);
- kRGBToLAB<4, true, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- } else {
- cudaFuncSetCacheConfig(
- kRGBToLAB<4, false, false>, cudaFuncCachePreferL1);
- kRGBToLAB<4, false, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- }
- }
- } else if (imgsPerThread == 2) {
- if (center) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(kRGBToLAB<2, true, true>, cudaFuncCachePreferL1);
- kRGBToLAB<2, true, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- } else {
- cudaFuncSetCacheConfig(
- kRGBToLAB<2, false, true>, cudaFuncCachePreferL1);
- kRGBToLAB<2, false, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kRGBToLAB<2, true, false>, cudaFuncCachePreferL1);
- kRGBToLAB<2, true, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- } else {
- cudaFuncSetCacheConfig(
- kRGBToLAB<2, false, false>, cudaFuncCachePreferL1);
- kRGBToLAB<2, false, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- }
- }
- } else {
- if (center) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(kRGBToLAB<1, true, true>, cudaFuncCachePreferL1);
- kRGBToLAB<1, true, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- } else {
- cudaFuncSetCacheConfig(
- kRGBToLAB<1, false, true>, cudaFuncCachePreferL1);
- kRGBToLAB<1, false, true><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kRGBToLAB<1, true, false>, cudaFuncCachePreferL1);
- kRGBToLAB<1, true, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- } else {
- cudaFuncSetCacheConfig(
- kRGBToLAB<1, false, false>, cudaFuncCachePreferL1);
- kRGBToLAB<1, false, false><<<blocks, threads, 0, stream>>>(
- images.getDevData(),
- target.getDevData(),
- imgPixels,
- numImages,
- images.getStride());
- }
- }
- }
- getLastCudaError("convRGBToLAB: kernel execution failed");
-}
-
-/*
- * imgs: (numChannels, imgPixels, numImages) with given imgStride
- * target: (numChannels, tgtPixels, numImages)
- */
-void convCrop(
- NVMatrix& imgs,
- NVMatrix& target,
- int imgSize,
- int tgtSize,
- int startY,
- int startX) {
- int numImages = imgs.getNumCols();
- int imgPixels = imgSize * imgSize;
- int tgtPixels = tgtSize * tgtSize;
-
- int numChannels = imgs.getNumRows() / imgPixels;
- assert(imgs.getNumRows() == imgPixels * numChannels);
- assert(imgPixels == imgSize * imgSize);
- assert(imgSize - startY >= tgtSize);
- assert(imgSize - startX >= tgtSize);
- assert(startY >= 0);
- assert(startX >= 0);
- target.resize(numChannels * tgtPixels, numImages);
- int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
- bool checkCaseBounds = numImages % (32 * imgsPerThread) != 0;
- dim3 blocks(
- DIVUP(numImages, 32 * imgsPerThread), numChannels * DIVUP(tgtPixels, 4));
- dim3 threads(32, 4);
- cudaStream_t stream = NVMatrix::getDefaultStream();
- if (imgsPerThread == 4) {
- if (checkCaseBounds) {
- kCrop<4, true><<<blocks, threads, 0, stream>>>(
- imgs.getDevData(),
- target.getDevData(),
- numImages,
- imgs.getStride(),
- imgSize,
- tgtSize,
- startY,
- startX);
- } else {
- kCrop<4, false><<<blocks, threads, 0, stream>>>(
- imgs.getDevData(),
- target.getDevData(),
- numImages,
- imgs.getStride(),
- imgSize,
- tgtSize,
- startY,
- startX);
- }
- } else if (imgsPerThread == 2) {
- if (checkCaseBounds) {
- kCrop<2, true><<<blocks, threads, 0, stream>>>(
- imgs.getDevData(),
- target.getDevData(),
- numImages,
- imgs.getStride(),
- imgSize,
- tgtSize,
- startY,
- startX);
- } else {
- kCrop<2, false><<<blocks, threads, 0, stream>>>(
- imgs.getDevData(),
- target.getDevData(),
- numImages,
- imgs.getStride(),
- imgSize,
- tgtSize,
- startY,
- startX);
- }
- } else {
- if (checkCaseBounds) {
- kCrop<1, true><<<blocks, threads, 0, stream>>>(
- imgs.getDevData(),
- target.getDevData(),
- numImages,
- imgs.getStride(),
- imgSize,
- tgtSize,
- startY,
- startX);
- } else {
- kCrop<1, false><<<blocks, threads, 0, stream>>>(
- imgs.getDevData(),
- target.getDevData(),
- numImages,
- imgs.getStride(),
- imgSize,
- tgtSize,
- startY,
- startX);
- }
- }
- getLastCudaError("convCrop: kernel execution failed");
-}
-
-/*
- * images: (numFilters, imgPixels, numImages)
- * meanDiffs: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages) (out)
- * target: (numFilters, imgPixels, numImages) (out)
-
- * Note: at present, I have no code to compute the meanDiffs. So it should be
- set
- * to be equal to images. In other words, this isn't really doing contrast
- normalization,
- * just response normalization.
- */
-void convContrastNormCrossMap(
- NVMatrix& images,
- NVMatrix& meanDiffs,
- NVMatrix& target,
- int numFilters,
- int sizeF,
- float addScale,
- float powScale,
- float minDiv,
- bool blocked) {
- int numImages = images.getNumCols();
- int imgPixels = images.getNumRows() / numFilters;
- assert(images.getNumRows() == numFilters * imgPixels);
- int imgSize = int(sqrt(imgPixels));
- assert(imgSize * imgSize == imgPixels);
- assert(meanDiffs.isSameDims(images));
- assert(sizeF > 0 && sizeF <= numFilters);
-
- assert(!meanDiffs.isTrans());
- assert(!images.isTrans());
- assert(images.isContiguous());
- assert(meanDiffs.isContiguous());
- assert(numFilters % 16 == 0);
-
- target.resize(images);
- // denoms.resize(images);
- assert(target.isContiguous());
-
- bool checkCaseBounds = numImages % 128 != 0;
-
- dim3 threads(32, 4);
- dim3 blocks(DIVUP(numImages, 32 * 4) * imgSize, (numFilters / 4) * imgSize);
- cudaStream_t stream = NVMatrix::getDefaultStream();
- // printf("convContrastNormCrossMap imgs: %p, meanDiffs: %p, denoms: %p,
- // target: %p, imgSize: %d, numFilters: %d, numImages: %d, sizeF: %d,
- // addScale: %f, powScale: %f, minDiv: %f, blocked: %d\n",
- // images.getDevData(), meanDiffs.getDevData(),
- // denoms.getDevData(), target.getDevData(), imgSize, numFilters,
- // numImages, sizeF, addScale, powScale, minDiv, blocked);
- if (blocked) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kFCNorm<4, 32, 4, true, true>, cudaFuncCachePreferL1);
- kFCNorm<4, 32, 4, true, true><<<blocks, threads, 0, stream>>>(
- images.getTextureObject(),
- meanDiffs.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kFCNorm<4, 32, 4, false, true>, cudaFuncCachePreferL1);
- kFCNorm<4, 32, 4, false, true><<<blocks, threads, 0, stream>>>(
- images.getTextureObject(),
- meanDiffs.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv);
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kFCNorm<4, 32, 4, true, false>, cudaFuncCachePreferL1);
- kFCNorm<4, 32, 4, true, false><<<blocks, threads, 0, stream>>>(
- images.getTextureObject(),
- meanDiffs.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv);
- } else {
- cudaFuncSetCacheConfig(
- kFCNorm<4, 32, 4, false, false>, cudaFuncCachePreferL1);
- kFCNorm<4, 32, 4, false, false><<<blocks, threads, 0, stream>>>(
- images.getTextureObject(),
- meanDiffs.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv);
- }
- }
-
- getLastCudaError("convContrastNormCrossMap: kernel execution failed");
-}
-
-/*
- * outGrads: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages)
- * inputs: (numFilters, imgPixels, numImages)
- * acts: (numFilters, imgPixels, numImages)
- * target: (numFilters, imgPixels, numImages)
- *
- * THIS WILL OVERWRITE THE ACTS MATRIX.
- */
-void convResponseNormCrossMapUndo(
- NVMatrix& outGrads,
- NVMatrix& inputs,
- NVMatrix& acts,
- NVMatrix& target,
- int numFilters,
- int sizeF,
- float addScale,
- float powScale,
- float minDiv,
- bool blocked,
- float scaleTargets,
- float scaleOutput) {
- int numImages = outGrads.getNumCols();
- int imgPixels = outGrads.getNumRows() / numFilters;
-
- int imgSize = int(sqrt(imgPixels));
- assert(imgSize * imgSize == imgPixels);
- assert(sizeF > 0 && sizeF <= numFilters);
- assert(outGrads.getNumRows() == numFilters * imgPixels);
-
- assert(!outGrads.isTrans());
- assert(!acts.isTrans());
- assert(!target.isTrans());
- assert(outGrads.isContiguous());
-
- assert(numFilters % 16 == 0);
-
- target.resize(outGrads);
- assert(target.isContiguous());
- // First do acts := -2 x scale x acts x outGrads / denoms
- // so that the main routine only has to do an addition in its inner loop.
- cudaStream_t stream = NVMatrix::getDefaultStream();
-
- dim3 threads2 = dim3(32, 4);
- dim3 blocks2 =
- dim3(DIVUP(numImages, 32 * 4) * imgSize, (numFilters / 4) * imgSize);
-
- bool checkCaseBounds = (numImages % 128) != 0;
- if (blocked) {
- if (scaleTargets == 0 && scaleOutput == 1) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kFRNormUndo2<4, 32, 4, false, true, true>, cudaFuncCachePreferL1);
- kFRNormUndo2<4, 32, 4, false, true, true>
- <<<blocks2, threads2, 0, stream>>>(
- outGrads.getTextureObject(),
- inputs.getTextureObject(),
- acts.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kFRNormUndo2<4, 32, 4, false, false, true>, cudaFuncCachePreferL1);
- kFRNormUndo2<4, 32, 4, false, false, true>
- <<<blocks2, threads2, 0, stream>>>(
- outGrads.getTextureObject(),
- inputs.getTextureObject(),
- acts.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kFRNormUndo2<4, 32, 4, true, true, true>, cudaFuncCachePreferL1);
- kFRNormUndo2<4, 32, 4, true, true, true>
- <<<blocks2, threads2, 0, stream>>>(
- outGrads.getTextureObject(),
- inputs.getTextureObject(),
- acts.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kFRNormUndo2<4, 32, 4, true, false, true>, cudaFuncCachePreferL1);
- kFRNormUndo2<4, 32, 4, true, false, true>
- <<<blocks2, threads2, 0, stream>>>(
- outGrads.getTextureObject(),
- inputs.getTextureObject(),
- acts.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv,
- scaleTargets,
- scaleOutput);
- }
- }
- } else {
- if (scaleTargets == 0 && scaleOutput == 1) {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kFRNormUndo2<4, 32, 4, false, true, false>, cudaFuncCachePreferL1);
- kFRNormUndo2<4, 32, 4, false, true, false>
- <<<blocks2, threads2, 0, stream>>>(
- outGrads.getTextureObject(),
- inputs.getTextureObject(),
- acts.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kFRNormUndo2<4, 32, 4, false, false, false>, cudaFuncCachePreferL1);
- kFRNormUndo2<4, 32, 4, false, false, false>
- <<<blocks2, threads2, 0, stream>>>(
- outGrads.getTextureObject(),
- inputs.getTextureObject(),
- acts.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv,
- scaleTargets,
- scaleOutput);
- }
- } else {
- if (checkCaseBounds) {
- cudaFuncSetCacheConfig(
- kFRNormUndo2<4, 32, 4, true, true, false>, cudaFuncCachePreferL1);
- kFRNormUndo2<4, 32, 4, true, true, false>
- <<<blocks2, threads2, 0, stream>>>(
- outGrads.getTextureObject(),
- inputs.getTextureObject(),
- acts.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv,
- scaleTargets,
- scaleOutput);
- } else {
- cudaFuncSetCacheConfig(
- kFRNormUndo2<4, 32, 4, true, false, false>, cudaFuncCachePreferL1);
- kFRNormUndo2<4, 32, 4, true, false, false>
- <<<blocks2, threads2, 0, stream>>>(
- outGrads.getTextureObject(),
- inputs.getTextureObject(),
- acts.getTextureObject(),
- target.getDevData(),
- imgSize,
- numFilters,
- numImages,
- sizeF,
- addScale,
- powScale,
- minDiv,
- scaleTargets,
- scaleOutput);
- }
- }
- }
-
- getLastCudaError("convResponseNormCrossMapUndo: kernel execution failed");
-}
-
-void convResponseNormCrossMap(
- NVMatrix& images,
- NVMatrix& target,
- int numFilters,
- int sizeF,
- float addScale,
- float powScale,
- float minDiv,
- bool blocked) {
- convContrastNormCrossMap(
- images,
- images,
- target,
- numFilters,
- sizeF,
- addScale,
- powScale,
- minDiv,
- blocked);
-}
-
-/*
- * images: (numFilters, imgPixels, numImages)
- * denoms: (numFilters, imgPixels, numImages) (out)
- * target: (numFilters, imgPixels, numImages) (out)
- */
-void convResponseNormCrossMap(
- NVMatrix& images,
- NVMatrix& target,
- int numFilters,
- int sizeF,
- float addScale,
- float powScale,
- bool blocked) {
- convContrastNormCrossMap(
- images,
- images,
- target,
- numFilters,
- sizeF,
- addScale,
- powScale,
- 1,
- blocked);
-}
-
-cudaTextureObject_t GetTensorTextureObject(caffe2::TensorCUDA* tensor) {
- cudaTextureObject_t tex_obj;
- cudaResourceDesc res_desc;
- std::memset(&res_desc, 0, sizeof(res_desc));
- res_desc.resType = cudaResourceTypeLinear;
- res_desc.res.linear.devPtr = tensor->mutable_data<float>();
- res_desc.res.linear.sizeInBytes = tensor->nbytes();
- res_desc.res.linear.desc =
- cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
- cudaTextureDesc tex_desc;
- std::memset(&tex_desc, 0, sizeof(tex_desc));
- CUDA_ENFORCE(
- cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr));
- return tex_obj;
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vector>
-
-#include "../../nvmatrix/include/nvmatrix.cuh"
-#include "../include/cudaconv2.cuh"
-
-__device__ __forceinline__ void
-filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(
- int fPidx,
- int imgLoadModPosY,
- int imgLoadModPosX,
- int imgSizeX,
- int filterSize,
- int& iPidx) {
- int x = imgLoadModPosX + (fPidx) % filterSize;
- int y = imgLoadModPosY + (fPidx) / filterSize;
- iPidx =
- y >= 0 && y < imgSizeX && x >= 0 && x < imgSizeX ? y * imgSizeX + x : -1;
-}
-
-#define FA_COLOR3_IMPRELOAD(c, i) \
- imPreload[c][i] = \
- iPidxNext < 0 || (checkImgBounds && myImgIdx + i * B_X >= numImages) \
- ? 0 \
- : mm[c * imgPixels * imgStride + i * B_X];
-#define FA_COLOR3_IMPRELOAD_TX(c, i) \
- imPreload[c][i] = \
- iPidxNext < 0 || (checkImgBounds && myImgIdx + i * B_X >= numImages) \
- ? 0 \
- : tex1Dfetch<float>( \
- images, imagesOffset2 + c * imgPixels * imgStride + i * B_X);
-
-/*
- * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
- * filters: (numFilterColors, filterPixels, numFilters) if conv
- * (numModules, numFilterColors, filterPixels, numFilters)
- * otherwise
- *
- * targets: (numFilters, numModulesY, numModulesX, numImages)
- *
- */
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- int numColors,
- int pixelCache,
- bool scale,
- bool checkImgBounds>
-//__launch_bounds__(128,3)
-__global__ void filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex(
- cudaTextureObject_t images,
- cudaTextureObject_t filters,
- float* targets,
- const int numImages,
- const int numFilters,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int numModulesY,
- const int numModulesX,
- const int imgStride,
- const float scaleTargets,
- const float scaleOutputs,
- const bool conv /*, const bool noloads*/) {
- __shared__ float
- shFilters[numColors][pixelCache]
- [B_Y * filtersPerThread]; // pre-load 1 pixel from
- // B_Y*filtersPerThread filters
- __shared__ float shImages[numColors][pixelCache]
- [B_X * imgsPerThread]; // pre-load 1 pixel from
- // B_X*imgsPerThread images
- const int imgPixels = imgSizeY * imgSizeX;
- const int filterPixels = filterSize * filterSize;
- const int blocksPerModule = numFilters / (B_Y * filtersPerThread);
- const int moduleIdx = blockIdx.y / blocksPerModule;
- const int blockFilterIdx =
- filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
-
- const int numModules = numModulesX * numModulesY;
- // Another fun insanity: the % B_X makes things faster, even thought
- // threadIdx.x is in the range 0..31. It appears that this allows the compiler
- // to optimize?
- const int tx = threadIdx.x % B_X;
- const int ty = threadIdx.y % B_Y;
- const int tidx = ty * B_X + threadIdx.x;
-
- const int imgLoadModPosY =
- paddingStart + (moduleIdx / numModulesX) * moduleStride;
- const int imgLoadModPosX =
- paddingStart + (moduleIdx % numModulesX) * moduleStride;
-
- const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
- const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
- const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
-
- // images += myImgIdx;
- // filters += blockFilterIdx
- // + shFilterLoadY * numFilters + shFilterLoadX;
- // if (!conv) { // NOTE: UNTESTED!
- // filters += moduleIdx * numColors * filterPixels * numFilters;
- // }
-
- const int imagesOffset = myImgIdx;
- const int filtersOffset = blockFilterIdx + shFilterLoadY * numFilters +
- shFilterLoadX +
- (conv ? 0 : moduleIdx * numColors * filterPixels * numFilters);
-
- targets += moduleIdx * numImages +
- (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages *
- numModules +
- myImgIdx;
-
- float prod[imgsPerThread][filtersPerThread];
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[i][f] = 0;
- }
- }
-
- int iPidxNext;
- float imPreload[numColors][imgsPerThread];
- float fPreload[numColors][pixelCache * filtersPerThread / B_X];
-
-#pragma unroll
- for (int c = 0; c < numColors; ++c) {
-#pragma unroll
- for (int p = 0; p < pixelCache; p += B_X / filtersPerThread) {
- if (p + shFilterLoadY < filterPixels) {
- fPreload[c][p * filtersPerThread / B_X] = tex1Dfetch<float>(
- filters,
- filtersOffset + p * numFilters + c * numFilters * filterPixels);
- } else {
- fPreload[c][p * filtersPerThread / B_X] = 0;
- }
- }
- }
-
- filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(
- ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext);
-
-#pragma unroll
- for (int c = 0; c < numColors; ++c) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (iPidxNext >= 0 &&
- (!checkImgBounds || myImgIdx + i * B_X < numImages)) {
- imPreload[c][i] = tex1Dfetch<float>(
- images,
- imagesOffset + (c * imgPixels + iPidxNext) * imgStride + i * B_X);
- } else {
- imPreload[c][i] = 0;
- }
- }
- }
-
- for (int p = 0; p < filterPixels; p += pixelCache) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int c = 0; c < numColors; ++c) {
- // NOTE: bank conflicts here!
- shImages[c][ty][tx * imgsPerThread + i] = imPreload[c][i];
- }
- }
-
- const int fPidxNext = p + pixelCache >= filterPixels ? 0 : p + pixelCache;
- filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(
- fPidxNext + ty,
- imgLoadModPosY,
- imgLoadModPosX,
- imgSizeX,
- filterSize,
- iPidxNext);
-
- // const float* ff = &filters[numFilters * fPidxNext];
- // const float* mm = &images[imgStride * iPidxNext];
- const int filtersOffset2 = filtersOffset + numFilters * fPidxNext;
- const int imagesOffset2 = imagesOffset + imgStride * iPidxNext;
-
- FA_COLOR3_IMPRELOAD_TX(0, 0);
- FA_COLOR3_IMPRELOAD_TX(0, 1);
- FA_COLOR3_IMPRELOAD_TX(0, 2);
- FA_COLOR3_IMPRELOAD_TX(0, 3);
-
-#pragma unroll
- for (int c = 0; c < numColors; ++c) {
-#pragma unroll
- for (int pp = 0; pp < pixelCache; pp += B_X / filtersPerThread) {
- shFilters[c][pp + shFilterLoadY][shFilterLoadX] =
- fPreload[c][pp * filtersPerThread / B_X];
- }
- }
-
- __syncthreads();
- FA_COLOR3_IMPRELOAD_TX(1, 0);
- FA_COLOR3_IMPRELOAD_TX(1, 1);
- FA_COLOR3_IMPRELOAD_TX(1, 2);
- FA_COLOR3_IMPRELOAD_TX(1, 3);
- FA_COLOR3_IMPRELOAD_TX(2, 0);
- FA_COLOR3_IMPRELOAD_TX(2, 1);
- FA_COLOR3_IMPRELOAD_TX(2, 2);
- FA_COLOR3_IMPRELOAD_TX(2, 3);
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int pp = 0; pp < pixelCache * filtersPerThread / B_X; pp++) {
- fPreload[c][pp] =
- fPidxNext + pp * (B_X / filtersPerThread) + shFilterLoadY >=
- filterPixels
- ? 0
- : tex1Dfetch<float>(
- filters,
- filtersOffset2 + c * numFilters * filterPixels +
- pp * (B_X / filtersPerThread) * numFilters);
- }
- }
-#pragma unroll
- for (int pp = 0; pp < pixelCache; pp++) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[i][f] += shImages[c][pp][tx * imgsPerThread + i] *
- shFilters[c][pp][ty * filtersPerThread + f];
- }
- }
- }
- }
-
- __syncthreads();
- }
-
- if (scale) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
- targets[i * B_X + f * numImages * numModules] =
- scaleTargets * targets[i * B_X + f * numImages * numModules] +
- scaleOutputs * prod[i][f];
- }
- }
- }
- } else {
-// Note: reversing order of these loops saves 2 registers, but costs time
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
- targets[i * B_X + f * numImages * numModules] =
- scaleOutputs * prod[i][f];
- }
- }
- }
- }
-}
-
-/*
- * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
- * filters: (numFilterColors, filterPixels, numFilters) if conv
- * (numModules, numFilterColors, filterPixels, numFilters)
- * otherwise
- *
- * targets: (numFilters, numModulesY, numModulesX, numImages)
- *
- * This won't be pretty.
- *
- * Texture-object variant of the few-color filter-activation kernel. Each
- * block computes B_Y*filtersPerThread filters for B_X*imgsPerThread images
- * of one module. Filter and image pixels are double-buffered through the
- * registers fPreload/imPreload into shared memory so that texture fetches
- * for the next pixelCache slice overlap the math on the current slice.
- * If 'scale' is set, results are blended into 'targets' using scaleTargets
- * and scaleOutputs; otherwise 'targets' is overwritten with
- * scaleOutputs * prod.
- */
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- int numColors,
- int pixelCache,
- bool scale,
- bool checkImgBounds>
-__global__ void filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex(
- cudaTextureObject_t images,
- cudaTextureObject_t filters,
- float* targets,
- const int numImages,
- const int numFilters,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int numModulesY,
- const int numModulesX,
- const int imgStride,
- const float scaleTargets,
- const float scaleOutputs,
- const bool conv /*, const bool noloads*/) {
- __shared__ float
- shFilters[numColors][pixelCache]
- [B_Y * filtersPerThread]; // pre-load 1 pixel from
- // B_Y*filtersPerThread filters
- __shared__ float shImages[numColors][pixelCache]
- [B_X * imgsPerThread]; // pre-load 1 pixel from
- // B_X*imgsPerThread images
- const int imgPixels = imgSizeY * imgSizeX;
- const int filterPixels = filterSize * filterSize;
- // The filters of one module are spread over 'blocksPerModule' blocks
- // along grid y; decode module index and this block's filter offset.
- const int blocksPerModule = numFilters / (B_Y * filtersPerThread);
- const int moduleIdx = blockIdx.y / blocksPerModule;
- const int blockFilterIdx =
- filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
-
- const int numModules = numModulesX * numModulesY;
- // Another fun insanity: the % B_X makes things faster, even though
- // threadIdx.x is in the range 0..31. It appears that this allows the compiler
- // to optimize?
- const int tx = threadIdx.x % B_X;
- const int ty = threadIdx.y % B_Y;
- const int tidx = ty * B_X + threadIdx.x;
- // Warp index within the block; only warps 0..2 participate in the filter
- // preloads below (see the 'warp < 3' guards).
- const int warp = tidx / 32;
-
- // Top-left corner of this module's receptive field (may be negative due
- // to padding).
- const int imgLoadModPosY =
- paddingStart + (moduleIdx / numModulesX) * moduleStride;
- const int imgLoadModPosX =
- paddingStart + (moduleIdx % numModulesX) * moduleStride;
-
- // Coordinates this thread uses when cooperatively filling shFilters.
- const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
- const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
- const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
-
- // images += myImgIdx;
- // filters += blockFilterIdx
- // + shFilterLoadY * numFilters + shFilterLoadX;
- // if (!conv) { // NOTE: UNTESTED!
- // filters += moduleIdx * numColors * filterPixels * numFilters;
- // }
-
- // Texture fetches take element offsets, so the old pointer arithmetic
- // (commented out above) is kept as integer offsets instead.
- const int imagesOffset = myImgIdx;
- const int filtersOffset = blockFilterIdx + shFilterLoadY * numFilters +
- shFilterLoadX +
- (conv ? 0 : moduleIdx * numColors * filterPixels * numFilters);
-
- targets += moduleIdx * numImages +
- (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages *
- numModules +
- myImgIdx;
-
- // Per-thread accumulators, zero-initialized.
- float prod[imgsPerThread][filtersPerThread];
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[i][f] = 0;
- }
- }
-
- // Double-buffer registers: iPidxNext holds the image-pixel index of the
- // slice being prefetched; imPreload/fPreload hold the next slice's
- // image/filter values before they are staged into shared memory.
- int iPidxNext;
- float imPreload[numColors][imgsPerThread];
- float fPreload[numColors][DIVUP(pixelCache * filtersPerThread, B_X)];
-
- // Initial filter preload (warps 0..2 only); out-of-range pixels become 0.
- if (warp < 3) {
-#pragma unroll
- for (int c = 0; c < numColors; ++c) {
-#pragma unroll
- for (int p = 0; p < pixelCache; p += 2) {
- if (p + shFilterLoadY < filterPixels) {
- fPreload[c][p / 2] = tex1Dfetch<float>(
- filters,
- filtersOffset + p * numFilters + c * numFilters * filterPixels);
- } else {
- fPreload[c][p / 2] = 0;
- }
- }
- }
- }
-
- // setImgCoords (defined elsewhere in this file) computes iPidxNext for
- // this thread's row; judging by the guards below, a negative value
- // appears to mean "outside the padded image" -- TODO confirm.
- filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(
- ty, imgLoadModPosY, imgLoadModPosX, imgSizeX, filterSize, iPidxNext);
-
-// Initial image preload; padding / out-of-batch positions become 0.
-#pragma unroll
- for (int c = 0; c < numColors; ++c) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (iPidxNext >= 0 &&
- (!checkImgBounds || myImgIdx + i * B_X < numImages)) {
- imPreload[c][i] = tex1Dfetch<float>(
- images,
- imagesOffset + (c * imgPixels + iPidxNext) * imgStride + i * B_X);
- } else {
- imPreload[c][i] = 0;
- }
- }
- }
-
- // Main loop over filter pixels, pixelCache at a time: stage the preloaded
- // values into shared memory, issue the next slice's fetches, then do the
- // math for the current slice.
- for (int p = 0; p < filterPixels; p += pixelCache) {
- // Wrap to 0 on the last slice so the (unused) final prefetch is in range.
- const int fPidxNext = p + pixelCache >= filterPixels ? 0 : p + pixelCache;
- filterActs_YxX_color_preload_ty_4_tx_32_f_16_cc_3_setImgCoords(
- fPidxNext + ty,
- imgLoadModPosY,
- imgLoadModPosX,
- imgSizeX,
- filterSize,
- iPidxNext);
-
-#pragma unroll
- for (int c = 0; c < numColors; ++c) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- // NOTE: bank conflicts here!
- shImages[c][ty][tx * imgsPerThread + i] = imPreload[c][i];
- }
- }
-
- if (warp < 3) {
-#pragma unroll
- for (int c = 0; c < numColors; ++c) {
-#pragma unroll
- for (int pp = 0; pp < pixelCache; pp += 2) {
- shFilters[c][pp + shFilterLoadY][shFilterLoadX] = fPreload[c][pp / 2];
- }
- }
- }
-
- __syncthreads();
- // const float* ff = &filters[numFilters * fPidxNext];
- // const float* mm = &images[imgStride * iPidxNext];
- const int filtersOffset2 = filtersOffset + numFilters * fPidxNext;
- const int imagesOffset2 = imagesOffset + imgStride * iPidxNext;
-
-// FA_COLOR3_IMPRELOAD_TX (macro defined elsewhere in this file) appears to
-// refill imPreload[c][i] from the images texture for the next slice.
-#pragma unroll
- for (int i = 0; i < imgsPerThread; ++i) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- FA_COLOR3_IMPRELOAD_TX(c, i);
- }
- }
-
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-// NOTE(review): the loop bound 2 hard-codes DIVUP(pixelCache *
-// filtersPerThread, B_X) for the px_4/f_12/tx_32 instantiation this kernel
-// is named for -- confirm before reusing with other template arguments.
-#pragma unroll
- for (int pp = 0; pp < 2; pp++) {
- fPreload[c][pp] =
- warp >= 3 || fPidxNext + pp * 2 + shFilterLoadY >= filterPixels
- ? 0
- : tex1Dfetch<float>(
- filters,
- filtersOffset2 + c * numFilters * filterPixels +
- pp * 2 * numFilters);
- }
-// Accumulate the current slice from shared memory.
-#pragma unroll
- for (int pp = 0; pp < pixelCache; pp++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[i][f] += shImages[c][pp][tx * imgsPerThread + i] *
- shFilters[c][pp][ty * filtersPerThread + f];
- }
- }
- }
- }
- __syncthreads();
- }
-
- // Write results: blend into existing targets when 'scale', else overwrite.
- if (scale) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
- targets[i * B_X + f * numImages * numModules] =
- scaleTargets * targets[i * B_X + f * numImages * numModules] +
- scaleOutputs * prod[i][f];
- }
- }
- }
- } else {
-// Note: reversing order of these loops costs 2 registers, but saves time
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
- targets[i * B_X + f * numImages * numModules] =
- scaleOutputs * prod[i][f];
- }
- }
- }
- }
-}
-
-// Translate an absolute image coordinate (imgY, imgX) into two row-major
-// linear indices: fPidx, the pixel's offset inside the filter window
-// anchored at (imgLoadModPosY, imgLoadModPosX), and iPidx, the pixel's
-// offset inside the image plane.
-__device__ inline void
-filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(
- int filterSize,
- int imgSizeX,
- int imgLoadModPosY,
- int imgLoadModPosX,
- int imgY,
- int imgX,
- int& fPidx,
- int& iPidx) {
- fPidx = (imgY - imgLoadModPosY) * filterSize + (imgX - imgLoadModPosX);
- iPidx = imgY * imgSizeX + imgX; // Pixel index in img
-}
-
-/*
- * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
- * filters: (numFilterColors, filterPixels, numFilters) if conv
- * (numModules, numFilterColors, filterPixels, numFilters)
- * otherwise
- *
- * targets: (numFilters, numModulesY, numModulesX, numImages)
- *
- * Note: in git there's a 1.5% faster version of this which uses 167 registers
- * instead of 154... it's basically the same thing, but it doesn't do the
- * next-pixel computation. It just avoids pre-loading when it rolls over to the
- * next pixel.
- *
- * Multi-color (grouped) filter-activation kernel. Walks the receptive
- * field pixel by pixel; for each pixel it iterates colors colorCache at a
- * time, staging one pixel of filters/images into shared memory while the
- * next fetches are preloaded into registers (imPreload/fPreload).
- */
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- int colorCache,
- bool scale,
- bool checkImgBounds>
-__global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4(
- float* images,
- float* filters,
- float* targets,
- const int numImages,
- const int numFilters,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int numModulesY,
- const int numModulesX,
- const int imgStride,
- const int numImgColors,
- const int numGroups,
- const float scaleTargets,
- const float scaleOutputs,
- const bool conv /*, const bool noloads*/) {
- __shared__ float
- shFilters[colorCache]
- [B_Y * filtersPerThread]; // pre-load 1 pixel from
- // B_Y*filtersPerThread filters
- __shared__ float shImages[colorCache]
- [B_X * imgsPerThread]; // pre-load 1 pixel from
- // B_X*imgsPerThread images
- const int imgPixels = imgSizeY * imgSizeX;
- const int filterPixels = filterSize * filterSize;
- const int numFilterColors = numImgColors / numGroups;
- // Decode module index / filter offset / group from grid y.
- const int blocksPerModule = numFilters / (B_Y * filtersPerThread);
- const int moduleIdx = blockIdx.y / blocksPerModule;
- const int blockFilterIdx =
- filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
-
- const int numModules = numModulesX * numModulesY;
- const int blockColorIdx = numFilterColors * blockGroupIdx;
- // Another fun insanity: the % B_X makes things faster, even though
- // threadIdx.x is in the range 0..31. It appears that this allows the compiler
- // to optimize?
- const int tx = threadIdx.x % B_X;
- const int ty = threadIdx.y % B_Y;
- const int tidx = ty * B_X + threadIdx.x;
-
- // Top-left corner of this module's receptive field (may be negative).
- const int imgLoadModPosY =
- paddingStart + (moduleIdx / numModulesX) * moduleStride;
- const int imgLoadModPosX =
- paddingStart + (moduleIdx % numModulesX) * moduleStride;
-
- // Coordinates this thread uses when cooperatively filling shFilters.
- const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
- const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
- const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
-
- images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx;
- filters += blockFilterIdx + shFilterLoadY * numFilters * filterPixels +
- shFilterLoadX;
- if (!conv) {
- // Non-convolutional: each module has its own filter bank.
- filters += moduleIdx * numFilterColors * filterPixels * numFilters;
- }
-
- targets += moduleIdx * numImages +
- (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages *
- numModules +
- myImgIdx;
-
- // Per-thread accumulators, zero-initialized.
- float prod[imgsPerThread][filtersPerThread];
- // float fCache[filtersPerThread];
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[i][f] = 0;
- }
- }
- // NOTE: these max/min functions increase register usage as compared to my
- // macros
- // Clip the receptive field to the image (handles padding).
- const int imgStartX = max(0, imgLoadModPosX);
- const int imgStartY = max(0, imgLoadModPosY);
- const int imgEndX = min(imgLoadModPosX + filterSize, imgSizeX);
- const int imgEndY = min(imgLoadModPosY + filterSize, imgSizeY);
- // __shared__ int imgPos[]
-
- // fPidx/iPidx: current filter-pixel and image-pixel indices; the *Next
- // variants below are computed one pixel ahead so loads can issue early.
- int fPidx, iPidx;
- float imPreload[imgsPerThread];
- float fPreload[colorCache * filtersPerThread / B_X];
- // float fCache[filtersPerThread];
-
- filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(
- filterSize,
- imgSizeX,
- imgLoadModPosY,
- imgLoadModPosX,
- imgStartY,
- imgStartX,
- fPidx,
- iPidx);
-
-// Initial image preload; out-of-batch columns become 0.
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
- imPreload[i] = images[imgStride * iPidx + i * B_X];
- } else {
- imPreload[i] = 0;
- }
- }
- if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY <
- B_X / filtersPerThread) { // This if statement reduces reg usage..
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_X / filtersPerThread) {
- fPreload[c * filtersPerThread / B_X] =
- filters[(c * filterPixels + fPidx) * numFilters];
- }
- }
- for (int imgY = imgStartY; imgY < imgEndY; ++imgY) {
- // const int filterPxY = imgY - imgLoadModPosY;
- for (int imgX = imgStartX; imgX < imgEndX; ++imgX) {
- // const int filterPxX = imgX - imgLoadModPosX;
- // const int p = filterPxY * filterSize + filterPxX;
- // const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in
- // img setPixelCoords(filterSize, imgSizeX, imgLoadModPosY,
- // imgLoadModPosX, imgY, imgX, &p, &pixIdx); float* m =
- // &images[imgStride * pixIdx];
- // Compute the coordinates of the NEXT pixel (wrapping row-major over
- // the clipped receptive field) so its loads can be prefetched.
- const bool lastPixel = imgY == imgEndY - 1 && imgX == imgEndX - 1;
- int imgYNext = imgY;
- int imgXNext = imgX;
- int fPidxNext, iPidxNext;
- if (!lastPixel) {
- imgYNext = imgY + (imgX + 1 == imgEndX);
- imgXNext = imgX + 1 == imgEndX ? imgStartX : imgX + 1;
- }
- filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(
- filterSize,
- imgSizeX,
- imgLoadModPosY,
- imgLoadModPosX,
- imgYNext,
- imgXNext,
- fPidxNext,
- iPidxNext);
- for (int oc = 0; oc < numFilterColors;
- oc += colorCache) { // oc stands for outer color (loop)
- // ff/mm point at the data to prefetch: the next color slice, or --
- // on the last slice -- the next pixel's first slice.
- const float* ff =
- &filters[numFilters * ((oc + colorCache) * filterPixels + fPidx)];
- const float* mm =
- &images[imgStride * ((oc + colorCache) * imgPixels + iPidx)];
- if (oc == numFilterColors - colorCache) {
- ff = &filters[fPidxNext * numFilters];
- mm = &images[iPidxNext * imgStride];
- fPidx = fPidxNext;
- iPidx = iPidxNext;
- }
-
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_X / filtersPerThread) {
- shFilters[c + shFilterLoadY][shFilterLoadX] =
- fPreload[c * filtersPerThread / B_X];
- }
-
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- // NOTE: bank conflicts here!
- shImages[ty][tx * imgsPerThread + i] = imPreload[i];
- }
- // NOTE(review): the explicit indices 0..3 and the shImages[0..3] /
- // fPreload[0..1] unrolling below hard-code the i_4/f_16/c_4
- // instantiation this kernel is named for -- confirm before reusing
- // with other template arguments. Preloads are interleaved with the
- // math to improve instruction-level parallelism.
- imPreload[0] = (checkImgBounds && myImgIdx + 0 * B_X >= numImages)
- ? 0
- : mm[0 * B_X];
- imPreload[1] = (checkImgBounds && myImgIdx + 1 * B_X >= numImages)
- ? 0
- : mm[1 * B_X];
- imPreload[2] = (checkImgBounds && myImgIdx + 2 * B_X >= numImages)
- ? 0
- : mm[2 * B_X];
-
- __syncthreads();
-
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[i][f] += shImages[0][threadIdx.x * imgsPerThread + i] *
- shFilters[0][threadIdx.y * filtersPerThread + f];
- }
- }
-
- fPreload[0] = ff[0];
-
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[i][f] += shImages[1][threadIdx.x * imgsPerThread + i] *
- shFilters[1][threadIdx.y * filtersPerThread + f];
- }
- }
-
- fPreload[1] = ff[(B_X / filtersPerThread * filterPixels) * numFilters];
-
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[i][f] += shImages[2][threadIdx.x * imgsPerThread + i] *
- shFilters[2][threadIdx.y * filtersPerThread + f];
- }
- }
-
- imPreload[3] = (checkImgBounds && myImgIdx + 3 * B_X >= numImages)
- ? 0
- : mm[3 * B_X];
-
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[i][f] += shImages[3][threadIdx.x * imgsPerThread + i] *
- shFilters[3][threadIdx.y * filtersPerThread + f];
- }
- }
- __syncthreads();
- }
- }
- }
-
- // Write results: blend into existing targets when 'scale', else overwrite.
- if (scale) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
- targets[i * B_X + f * numImages * numModules] =
- scaleTargets * targets[i * B_X + f * numImages * numModules] +
- scaleOutputs * prod[i][f];
- }
- }
- }
- } else {
-// Note: reversing order of these loops saves 2 registers, but costs time
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
- targets[i * B_X + f * numImages * numModules] =
- scaleOutputs * prod[i][f];
- }
- }
- }
- }
-}
-
-/************************* Function Revision Record *************************
- * Author: Tencent BestImage Team (ankerguo@tencent.com)
- * Date:   2015-05-18
- * Reason: Optimizing kernel to get faster speed according to GPU features
- * Method:
- *   1. reorganizing data structure to avoid bank conflict;
- *   2. using vectorized data type;
- *   3. improving instruction-level parallelism;
- *   4. removing redundant 'if' branches;
- *   5. removing local variables to save registers.
- ****************************************************************************/
-
-/*
- * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
- * filters: (numFilterColors, filterPixels, numFilters) if conv
- * (numModules, numFilterColors, filterPixels, numFilters)
- * otherwise
- *
- * targets: (numFilters, numModulesY, numModulesX, numImages)
- *
- * Texture-object variant of the sparse2 preload kernel above, reworked per
- * the revision record: shared memory is laid out as float2 to avoid bank
- * conflicts, index registers are folded into combined offsets, and
- * __launch_bounds__(128, 4) caps the register budget. Per the comments
- * below, all callers instantiate it with checkImgBounds == false.
- */
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- int colorCache,
- bool scale,
- bool checkImgBounds>
-__global__ void __launch_bounds__(128, 4)
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(
- cudaTextureObject_t images,
- cudaTextureObject_t filters,
- float* targets,
- const int numImages,
- const int numFilters,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int numModulesY,
- const int numModulesX,
- const int imgStride,
- const int numImgColors,
- const int numGroups,
- const float scaleTargets,
- const float scaleOutputs,
- const bool conv /*, const bool noloads*/) {
- // avoid bank conflict by reorganizing the data structure and improve the band
- // width by using 'float2' instead of 'float'
- __shared__ float2
- shFilters[colorCache / 2]
- [B_Y * filtersPerThread]; // pre-load 1 pixel from
- // B_Y*filtersPerThread filters
- __shared__ float2
- shImages[colorCache][B_X * imgsPerThread / 2]; // pre-load 1 pixel from
- // B_X*imgsPerThread images
- const int imgPixels = imgSizeY * imgSizeX;
- const int filterPixels = filterSize * filterSize;
- const int numFilterColors = numImgColors / numGroups;
- // Decode module index / filter offset / group from grid y.
- const int blocksPerModule = numFilters / (B_Y * filtersPerThread);
- const int moduleIdx = blockIdx.y / blocksPerModule;
- const int blockFilterIdx =
- filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
-
- const int numModules = numModulesX * numModulesY;
- const int blockColorIdx = numFilterColors * blockGroupIdx;
- // Another fun insanity: the % B_X makes things faster, even though
- // threadIdx.x is in the range 0..31. It appears that this allows the compiler
- // to optimize?
- const int tx = threadIdx.x % B_X;
- const int ty = threadIdx.y % B_Y;
- // const int tidx = ty * B_X + threadIdx.x; // reduce one register
-
- const int imgLoadModPosY =
- paddingStart + (moduleIdx / numModulesX) * moduleStride;
- const int imgLoadModPosX =
- paddingStart + (moduleIdx % numModulesX) * moduleStride;
-
- // reduce two registers
- // const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
- // const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
- const int myImgIdx = blockIdx.x * B_X * imgsPerThread + tx;
- const int imgOffset = (blockColorIdx + ty) * imgPixels * imgStride + myImgIdx;
-
- // images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride +
- // myImgIdx;
- // Texture fetches take element offsets; shFilterLoadY/X are recomputed
- // inline from (ty * B_X + tx) instead of being kept in registers.
- const int filterOffset = blockFilterIdx +
- ((ty * B_X + tx) / (B_Y * filtersPerThread)) * numFilters * filterPixels +
- ((ty * B_X + tx) % (B_Y * filtersPerThread)) +
- (conv ? 0 : moduleIdx * numFilterColors * filterPixels * numFilters);
- // filters +=blockFilterIdx
- // + shFilterLoadY * numFilters * filterPixels + shFilterLoadX;
- // if (!conv) {
- // filters += moduleIdx * numFilterColors * filterPixels * numFilters;
- // }
-
- targets += moduleIdx * numImages +
- (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages *
- numModules +
- myImgIdx;
-
- // combine two registers into one
- const int numModImages = numModules * numImages;
- // Per-thread accumulators, zero-initialized.
- float prod[imgsPerThread][filtersPerThread];
- // float fCache[filtersPerThread];
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[i][f] = 0;
- }
- }
- // NOTE: these max/min functions increase register usage as compared to my
- // macros
- // Clip the receptive field to the image (handles padding).
- const int imgStartX = max(0, imgLoadModPosX);
- const int imgStartY = max(0, imgLoadModPosY);
- const int imgEndX = min(imgLoadModPosX + filterSize, imgSizeX);
- const int imgEndY = min(imgLoadModPosY + filterSize, imgSizeY);
- // __shared__ int imgPos[]
-
- // fPidx/iPidx: current filter-pixel and image-pixel indices; the *Next
- // variants below are computed one pixel ahead so loads can issue early.
- int fPidx, iPidx;
- float imPreload[imgsPerThread]; // [4]
- float fPreload[colorCache * filtersPerThread / B_X]; // [2]
- // float fCache[filtersPerThread];
-
- filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(
- filterSize,
- imgSizeX,
- imgLoadModPosY,
- imgLoadModPosX,
- imgStartY,
- imgStartX,
- fPidx,
- iPidx);
-
-// remove redundant conditions
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- imPreload[i] =
- tex1Dfetch<float>(images, imgOffset + imgStride * iPidx + i * B_X);
- }
-
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_X / filtersPerThread) {
- fPreload[c * filtersPerThread / B_X] = tex1Dfetch<float>(
- filters, filterOffset + (c * filterPixels + fPidx) * numFilters);
- }
- for (int imgY = imgStartY; imgY < imgEndY; ++imgY) {
- // const int filterPxY = imgY - imgLoadModPosY;
- for (int imgX = imgStartX; imgX < imgEndX; ++imgX) {
- // const int filterPxX = imgX - imgLoadModPosX;
- // const int p = filterPxY * filterSize + filterPxX;
- // const int pixIdx = imgY * imgSizeX + imgX;// Pixel index in
- // img setPixelCoords(filterSize, imgSizeX, imgLoadModPosY,
- // imgLoadModPosX, imgY, imgX, &p, &pixIdx); float* m =
- // &images[imgStride * pixIdx];
- // Compute the coordinates of the NEXT pixel (wrapping row-major over
- // the clipped receptive field) so its loads can be prefetched.
- const bool lastPixel = imgY == imgEndY - 1 && imgX == imgEndX - 1;
- int imgYNext = imgY;
- int imgXNext = imgX;
- int fPidxNext, iPidxNext;
- if (!lastPixel) {
- imgYNext = imgY + (imgX + 1 == imgEndX);
- imgXNext = imgX + 1 == imgEndX ? imgStartX : imgX + 1;
- }
- filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(
- filterSize,
- imgSizeX,
- imgLoadModPosY,
- imgLoadModPosX,
- imgYNext,
- imgXNext,
- fPidxNext,
- iPidxNext);
- for (int oc = 0; oc < numFilterColors;
- oc += colorCache) { // oc stands for outer color (loop)
- // store the preloaded pixel of filter and image into shared memory
- // NOTE(review): the explicit [0..3]/[0..1] indices here hard-code
- // the i_4/f_16/c_4 instantiation this kernel is named for.
- shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)]
- [(ty * B_X + tx) % (B_Y * filtersPerThread)]
- .x = fPreload[0];
- shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)]
- [(ty * B_X + tx) % (B_Y * filtersPerThread)]
- .y = fPreload[1];
- shImages[ty][tx].x = imPreload[0];
- shImages[ty][tx].y = imPreload[1];
- shImages[ty][tx + B_X].x = imPreload[2];
- shImages[ty][tx + B_X].y = imPreload[3];
-
- // Offsets to prefetch from: the next color slice, or -- on the last
- // slice -- the next pixel's first slice.
- int imgOffset2 =
- imgOffset + imgStride * ((oc + colorCache) * imgPixels + iPidx);
- int filterOffset2 = filterOffset +
- numFilters * ((oc + colorCache) * filterPixels + fPidx);
- if (oc == numFilterColors - colorCache) {
- filterOffset2 = filterOffset + fPidxNext * numFilters;
- imgOffset2 = imgOffset + iPidxNext * imgStride;
- fPidx = fPidxNext;
- iPidx = iPidxNext;
- }
-
- // preload one pixel of filter and image from texture, and no need to
- // check 'checkImgBounds' with all callers setting it as false
- imPreload[0] = tex1Dfetch<float>(images, imgOffset2);
- imPreload[1] = tex1Dfetch<float>(images, imgOffset2 + B_X);
- imPreload[2] = tex1Dfetch<float>(images, imgOffset2 + 2 * B_X);
- imPreload[3] = tex1Dfetch<float>(images, imgOffset2 + 3 * B_X);
- fPreload[0] = tex1Dfetch<float>(filters, filterOffset2);
- fPreload[1] = tex1Dfetch<float>(
- filters, filterOffset2 + 2 * filterPixels * numFilters);
-
- __syncthreads();
-
-// put together the instructions with same type to improve instruction-level
-// parallelism; calculate the convolution between images and filters
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int r = 0; r < colorCache / 2; r++) {
- prod[0][f] +=
- shImages[r][tx].x * shFilters[r][ty * filtersPerThread + f].x;
- prod[1][f] +=
- shImages[r][tx].y * shFilters[r][ty * filtersPerThread + f].x;
- prod[2][f] += shImages[r][tx + B_X].x *
- shFilters[r][ty * filtersPerThread + f].x;
- prod[3][f] += shImages[r][tx + B_X].y *
- shFilters[r][ty * filtersPerThread + f].x;
- prod[0][f] += shImages[r + 2][tx].x *
- shFilters[r][ty * filtersPerThread + f].y;
- prod[1][f] += shImages[r + 2][tx].y *
- shFilters[r][ty * filtersPerThread + f].y;
- prod[2][f] += shImages[r + 2][tx + B_X].x *
- shFilters[r][ty * filtersPerThread + f].y;
- prod[3][f] += shImages[r + 2][tx + B_X].y *
- shFilters[r][ty * filtersPerThread + f].y;
- }
- }
- __syncthreads();
- }
- }
- }
-
- // Write results: blend into existing targets when 'scale', else overwrite.
- if (scale) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- // remove the redundant condition for less registers
- targets[i * B_X + f * numModImages] =
- scaleTargets * targets[i * B_X + f * numModImages] +
- scaleOutputs * prod[i][f];
- }
- }
- } else {
-// Note: reversing order of these loops saves 2 registers, but costs time
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- // remove the redundant condition for less registers
- targets[i * B_X + f * numModImages] = scaleOutputs * prod[i][f];
- }
- }
- }
-}
-
-/*
- * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X
- * * imgsPerThread images. threadIdx.x determines image threadIdx.y determines
- * filter
- *
- * blockIdx.x determines image batch of B_X * imgsPerThread
- * blockIdx.y determines filter batch of module and B_Y * filtersPerThread
- *
- * images: (numColors, imgSizeY, imgSizeX, numImages) with stride given
- * filters: (numColors, filterPixels, numFilters) if conv
- * (numModules, numColors, filterPixels, numFilters) otherwise
- *
- * targets: (numFilters, numModulesY, numModulesX, numImages)
- *
- *
- * Number of filters per module should be divisible by B_Y * filtersPerThread
- * checkImgBounds indicates whether number of images is divisible by B_X *
- * imgsPerThread
- *
- * The imgSize here is the size of the actual image without the padding.
- *
- * Generic few-color kernel (no register preloading): for each chunk of
- * pixelCache filter pixels, all threads cooperatively stage filters and
- * images into shared memory, synchronize, and accumulate. Padding and
- * out-of-range pixels are staged as zeros so the inner product needs no
- * bounds checks.
- */
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- int numColors,
- int pixelCache,
- bool scale,
- bool checkImgBounds>
-__global__ void filterActs_YxX_color(
- float* images,
- float* filters,
- float* targets,
- const int numImages,
- const int numFilters,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int numModulesY,
- const int numModulesX,
- const int imgStride,
- const float scaleTargets,
- const float scaleOutputs,
- const bool conv) {
- __shared__ float
- shFilters[pixelCache * numColors]
- [B_Y * filtersPerThread]; // pre-load pixelCache pixels from
- // B_Y*filtersPerThread filters
- __shared__ float
- shImages[pixelCache * numColors]
- [B_X * imgsPerThread]; // pre-load pixelCache pixels from
- // B_X*imgsPerThread images
- const int imgPixels = imgSizeY * imgSizeX;
- const int filterPixels = filterSize * filterSize;
-
- // Decode module index and this block's filter batch from grid y.
- const int blocksPerModule = numFilters / (B_Y * filtersPerThread);
- const int moduleIdx = blockIdx.y / blocksPerModule;
- const int blockFilterIdx = blockIdx.y % blocksPerModule;
-
- const int tidx = threadIdx.y * B_X + threadIdx.x;
-
- // Top-left corner of this module's receptive field (may be negative).
- const int imgLoadModPosY =
- paddingStart + (moduleIdx / numModulesX) * moduleStride;
- const int imgLoadModPosX =
- paddingStart + (moduleIdx % numModulesX) * moduleStride;
- const int numModules = numModulesY * numModulesX;
- // Coordinates this thread uses when cooperatively filling shFilters.
- const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
- const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
- const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
- images += myImgIdx;
- filters += filtersPerThread * B_Y * blockFilterIdx +
- shFilterLoadY * numFilters + shFilterLoadX;
- if (!conv) {
- // Non-convolutional: each module has its own filter bank.
- filters += moduleIdx * numColors * filterPixels * numFilters;
- }
-
- targets += moduleIdx * numImages +
- (blockFilterIdx * B_Y * filtersPerThread +
- threadIdx.y * filtersPerThread) *
- numImages * numModulesY * numModulesX +
- myImgIdx;
-
- // Per-thread accumulators, zero-initialized.
- float prod[filtersPerThread][imgsPerThread];
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
- prod[f][g] = 0;
- }
- }
- // float* shImgLoad = &shImages[0][threadIdx.x];
- for (int p = 0; p < filterPixels; p += pixelCache) {
- /*
- * Load pixelCache pixels from B_Y*filtersPerThread filters
- * This condition covers the case when B_X is not divisible by
- * filtersPerThread. In this case, not all of the threads will participate
- * in the loading operation. This ensures that in each loop iteration, an
- * integer number of rows of shFilters are filled, which makes indexing
- * simple.
- */
- if (B_X % filtersPerThread == 0 || shFilterLoadY < B_X / filtersPerThread) {
-#pragma unroll
- for (int p2 = 0; p2 < pixelCache; p2 += B_X / filtersPerThread) {
- // 'omit' is a compile-time condition: when it holds, the range
- // check on preloadPx can be skipped entirely.
- const bool omit = pixelCache % (B_X / filtersPerThread) == 0;
- const int preloadPx = shFilterLoadY + p2;
- if (omit || preloadPx < pixelCache) {
- if (p + preloadPx < filterPixels) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shFilters[shFilterLoadY + p2 + c * pixelCache][shFilterLoadX] =
- filters[(c * filterPixels + p + p2) * numFilters];
- }
- } else {
- // Past the end of the filter: pad with zeros.
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shFilters[shFilterLoadY + p2 + c * pixelCache][shFilterLoadX] = 0;
- }
- }
- }
- }
- }
-
-/*
- * Load pixelCache pixels from B_X*imgsPerThread images.
- */
-#pragma unroll
- for (int ly = 0; ly < pixelCache; ly += B_Y) {
- const int preloadPx = ly + threadIdx.y;
- const int pixIdx = p + preloadPx;
- const bool omit = pixelCache % B_Y == 0; // Compile-time condition
- /*
- * Don't load any image pixels corresponding to filter pixels that don't
- * exist.
- */
- if (pixIdx < filterPixels && (omit || preloadPx < pixelCache)) {
- // Map the filter pixel to absolute image coordinates.
- const int x = imgLoadModPosX + pixIdx % filterSize;
- const int y = imgLoadModPosY + pixIdx / filterSize;
-
- if (y >= 0 && y < imgSizeY && x >= 0 && x < imgSizeX) {
- float* m = &images[imgStride * (y * imgSizeX + x)];
-
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
- shImages[preloadPx + c * pixelCache]
- [threadIdx.x * imgsPerThread + i] =
- m[c * imgStride * imgPixels + i * B_X];
- } else {
- // Beyond the image batch: pad with zeros.
- shImages[preloadPx + c * pixelCache]
- [threadIdx.x * imgsPerThread + i] = 0;
- }
- }
- }
- } else { // Padding
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[preloadPx + c * pixelCache]
- [threadIdx.x * imgsPerThread + i] = 0;
- }
- }
- }
- }
- }
-
- __syncthreads();
-
- // Accumulate over all staged (pixel, color) rows.
-#pragma unroll
- for (int i = 0; i < pixelCache * numColors; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
- prod[f][g] += shImages[i][g + threadIdx.x * imgsPerThread] *
- shFilters[i][threadIdx.y * filtersPerThread + f];
- }
- }
- }
- __syncthreads();
- }
-
- // Write results: blend into existing targets when 'scale', else overwrite.
- if (scale) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
- if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
- targets[g * B_X + f * numImages * numModules] =
- scaleTargets * targets[g * B_X + f * numImages * numModules] +
- scaleOutputs * prod[f][g];
- }
- }
- }
- } else {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
- if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets[g * B_X + f * numImages * numModules] =
- scaleOutputs * prod[f][g];
- }
- }
- }
- }
-}
-
-/*
- * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X
- * * imgsPerThread images. threadIdx.x determines image threadIdx.y determines
- * filter
- *
- * blockIdx.x determines image batch of B_X * imgsPerThread
- * blockIdx.y determines filter batch of B_Y * filtersPerThread
- *
- * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
- * filters: (numFilterColors, filterPixels, numFilters) if conv
- * (numModules, numFilterColors, filterPixels, numFilters)
- * otherwise
- *
- * targets: (numFilters, numModulesY, numModulesX, numImages)
- *
- * B_Y one of 4, 8, 16
- * B_X one of 16, 32
- * imgsPerThread one of 1, 2, 4
- * filtersPerThread one of 1, 2, 4, 8
- * colorCache: how many colors to put into shmem
- *
- * numFilters should be divisible by B_Y * filtersPerThread
- * numImages be divisible by B_X * imgsPerThread
- * numFilterColors should be divisible by colorCache.
- * numImgColors must be even.
- * numFilters must be divisible by numGroups.
- * no restrictions on pixelCache
- * The imgSize here is the size of the actual image without the padding.
- * As always, try to make B_X * imgsPerThread == B_Y * filtersPerThread for
- * maximum efficiency.
- *
- */
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- int colorCache,
- bool scale,
- bool checkImgBounds>
-__global__ void filterActs_YxX_sparse2(
- float* images,
- float* filters,
- float* targets,
- const int numImages,
- const int numFilters,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int numModulesY,
- const int numModulesX,
- const int imgStride,
- const int numImgColors,
- const int numGroups,
- const float scaleTargets,
- const float scaleOutputs,
- const bool conv) {
- __shared__ float
- shFilters[colorCache]
- [B_Y * filtersPerThread]; // pre-load 1 pixel from
- // B_Y*filtersPerThread filters
- __shared__ float shImages[colorCache]
- [B_X * imgsPerThread]; // pre-load 1 pixel from
- // B_X*imgsPerThread images
- const int imgPixels = imgSizeY * imgSizeX;
- const int filterPixels = filterSize * filterSize;
- const int numFilterColors = numImgColors / numGroups;
- const int blocksPerModule = numFilters / (B_Y * filtersPerThread);
- const int moduleIdx = blockIdx.y / blocksPerModule;
- const int blockFilterIdx =
- filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
-
- const int numModules = numModulesX * numModulesY;
- const int blockColorIdx = numFilterColors * blockGroupIdx;
-
- const int tidx = threadIdx.y * B_X + threadIdx.x;
-
- const int imgLoadModPosY =
- paddingStart + (moduleIdx / numModulesX) * moduleStride;
- const int imgLoadModPosX =
- paddingStart + (moduleIdx % numModulesX) * moduleStride;
-
- const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
- const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
- const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
-
- images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx;
- filters += blockFilterIdx + shFilterLoadY * numFilters * filterPixels +
- shFilterLoadX;
- if (!conv) {
- filters += moduleIdx * numFilterColors * filterPixels * numFilters;
- }
-
- targets += moduleIdx * numImages +
- (blockFilterIdx + threadIdx.y) * numImages * numModules + myImgIdx;
-
- float prod[filtersPerThread][imgsPerThread];
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
- prod[f][g] = 0;
- }
- }
- const int imgStartX = MAX(0, imgLoadModPosX);
- const int imgStartY = MAX(0, imgLoadModPosY);
- const int imgEndX = MIN(imgLoadModPosX + filterSize, imgSizeX);
- const int imgEndY = MIN(imgLoadModPosY + filterSize, imgSizeY);
- // __shared__ int imgPos[]
-
- for (int imgY = imgStartY; imgY < imgEndY; ++imgY) {
- const int filterPxY = imgY - imgLoadModPosY;
- for (int imgX = imgStartX; imgX < imgEndX; ++imgX) {
- const int filterPxX = imgX - imgLoadModPosX;
- const int p = filterPxY * filterSize + filterPxX;
- for (int oc = 0; oc < numFilterColors;
- oc += colorCache) { // oc stands for outer color (loop)
-
- /*
- * Load a pixel from B_Y*filtersPerThread filters
- * This condition covers the case when B_X is not divisible by
- filtersPerThread.
- * In this case, not all of the threads will participate in the loading
- operation.
- * This ensures that in each loop iteration, an integer number of rows
- of shFilters
- * are filled, which makes indexing simple.
-
- * nvcc is behaving in a completely insane way: removing this condition
- under
- * template parameters that guarantee it to be true actually slows down
- * the computation.
- *
- */
- if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY <
- B_X / filtersPerThread) {
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_X / filtersPerThread) {
- if (colorCache % (B_X / filtersPerThread) == 0 ||
- c + shFilterLoadY < colorCache) {
- shFilters[c + shFilterLoadY][shFilterLoadX] =
- filters[((oc + c) * filterPixels + p) * numFilters];
- }
- }
- }
-
- /*
- * Load a pixel from B_X*imgsPerThread images.
- */
- const int pixIdx = imgY * imgSizeX + imgX; // Pixel index in img
-
- float* m = &images[imgStride * (oc * imgPixels + pixIdx)];
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_Y) {
- if (colorCache % B_Y == 0 || threadIdx.y + c < colorCache) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
- shImages[c + threadIdx.y][threadIdx.x + i * B_X] =
- m[c * imgStride * imgPixels + i * B_X];
- } else {
- shImages[c + threadIdx.y][threadIdx.x + i * B_X] = 0;
- }
- }
- }
- }
-
- __syncthreads();
-
- for (int c = 0; c < colorCache; c++) {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[f][g] += shImages[c][g * B_X + threadIdx.x] *
- shFilters[c][threadIdx.y + f * B_Y];
- }
- }
- }
- __syncthreads();
- }
- }
- }
-
- if (scale) {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
- if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets *
- targets[g * B_X + f * B_Y * numImages * numModules] +
- scaleOutputs * prod[f][g];
- }
- }
- }
- } else {
-// Note: reversing order of these loops saves 2 registers, but costs time
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
- if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
- targets[g * B_X + f * B_Y * numImages * numModules] =
- scaleOutputs * prod[f][g];
- }
- }
- }
- }
-}
-
-/*****************************Function Revision
- *Record***************************** Author: Tencent BestImage
- *Team(ankerguo@tencent.com) * Date: 2015-05-18 *
- * Reason: Optimizing kernel to get faster speed according to GPU features *
- * Method: *
- * 1. reorganizing data structure to avoid bank conflict; *
- * 2. using vectorized data type; * Note: This function can be used
- *when each thread loads even number of filter * pixels(filtersPerThread *
- *colorCache / B_X is even), and this can be * optimized more when the number
- *of loaded image's pixel is even. *
- *********************************************************************************/
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int filtersPerThread,
- int colorCache,
- bool scale,
- bool checkImgBounds>
-__global__ void filterActs_YxX_sparse2_f_vec(
- float* images,
- float* filters,
- float* targets,
- const int numImages,
- const int numFilters,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int numModulesY,
- const int numModulesX,
- const int imgStride,
- const int numImgColors,
- const int numGroups,
- const float scaleTargets,
- const float scaleOutputs,
- const bool conv) {
- // improve shared memory's band width by using 'float2' instead of 'float'
- __shared__ float2
- shFilters[colorCache / 2]
- [B_Y * filtersPerThread]; // pre-load 1 pixel from
- // B_Y*filtersPerThread filters
- __shared__ float shImages[colorCache]
- [B_X * imgsPerThread]; // pre-load 1 pixel from
- // B_X*imgsPerThread images
-
- const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y;
- const int imgPixels = imgSizeY * imgSizeX;
- const int filterPixels = filterSize * filterSize;
- const int numFilterColors = numImgColors / numGroups;
- const int blocksPerModule = numFilters / (B_Y * filtersPerThread);
- const int moduleIdx = blockIdx.y / blocksPerModule;
- const int blockFilterIdx =
- filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
-
- const int numModules = numModulesX * numModulesY;
- const int blockColorIdx = numFilterColors * blockGroupIdx;
-
- const int tidx = ty * B_X + tx;
-
- const int imgLoadModPosY =
- paddingStart + (moduleIdx / numModulesX) * moduleStride;
- const int imgLoadModPosX =
- paddingStart + (moduleIdx % numModulesX) * moduleStride;
-
- // load position of filters' pixels for current thread
- const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
- const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
- // load position of images' pixels for current thread
- const int shImgLoadY = tidx / (B_X * imgsPerThread);
- const int shImgLoadX = tidx % (B_X * imgsPerThread);
-
- const int myImgIdx = blockIdx.x * B_X * imgsPerThread + shImgLoadX;
- images += (blockColorIdx + shImgLoadY) * imgPixels * imgStride + myImgIdx;
-
- filters += blockFilterIdx + shFilterLoadY * numFilters * filterPixels +
- shFilterLoadX;
- if (!conv) {
- filters += moduleIdx * numFilterColors * filterPixels * numFilters;
- }
-
- targets += moduleIdx * numImages +
- (blockFilterIdx + ty) * numImages * numModules +
- blockIdx.x * B_X * imgsPerThread + tx;
-
- float prod[filtersPerThread][imgsPerThread];
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
- prod[f][g] = 0;
- }
- }
-
- const int imgStartX = MAX(0, imgLoadModPosX);
- const int imgStartY = MAX(0, imgLoadModPosY);
- const int imgEndX = MIN(imgLoadModPosX + filterSize, imgSizeX);
- const int imgEndY = MIN(imgLoadModPosY + filterSize, imgSizeY);
-
- // temporary buffer to store the filter's loaded pixels during each loop
- float fPreload[colorCache * filtersPerThread / B_X];
- // temporary buffer to store the image's loaded pixels during each loop
- float iPreload[colorCache * imgsPerThread / B_Y];
-
-// preload filter's pixels
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_X / filtersPerThread) {
- fPreload[c * filtersPerThread / B_X] = filters
- [(c * filterPixels + (imgStartY - imgLoadModPosY) * filterSize +
- (imgStartX - imgLoadModPosX)) *
- numFilters];
- }
-
- // preload image's pixels
- if (!checkImgBounds || myImgIdx < numImages) {
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) {
- iPreload[c * imgsPerThread / B_Y] = images
- [(c * imgPixels + imgStartY * imgSizeX + imgStartX) * imgStride];
- }
- } else {
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) {
- iPreload[c * imgsPerThread / B_Y] = 0;
- }
- }
-
- for (int imgY = imgStartY; imgY < imgEndY; ++imgY) {
- // const int filterPxY = imgY - imgLoadModPosY;
- for (int imgX = imgStartX; imgX < imgEndX; ++imgX) {
- for (int oc = 0; oc < numFilterColors;
- oc += colorCache) { // oc stands for outer color (loop)
-// store the preloaded filter's pixels into shared memory
-#pragma unroll
- for (int c = 0; c < colorCache / 2; c += B_X / filtersPerThread) {
- shFilters[c + shFilterLoadY][shFilterLoadX].x =
- fPreload[c * filtersPerThread / B_X];
- shFilters[c + shFilterLoadY][shFilterLoadX].y =
- fPreload[(c + colorCache / 2) * filtersPerThread / B_X];
- }
-
-// store the preloaded image's pixels into shared memory
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) {
- shImages[c + shImgLoadY][shImgLoadX] =
- iPreload[c * imgsPerThread / B_Y];
- }
- /*
- * Load a pixel from B_Y*filtersPerThread filters
- * This condition covers the case when B_X is not divisible by
- filtersPerThread.
- * In this case, not all of the threads will participate in the loading
- operation.
- * This ensures that in each loop iteration, an integer number of rows
- of shFilters
- * are filled, which makes indexing simple.
-
- * nvcc is behaving in a completely insane way: removing this condition
- under
- * template parameters that guarantee it to be true actually slows down
- * the computation.
- *
- */
-
- /* preload image and filter pixels' data */
- if ((oc + colorCache) ==
- numFilterColors) { // move to next pixel when all colors of current
- // pixel have been finished
- int imgXn = (imgX < (imgEndX - 1)) ? (imgX + 1) : imgStartX;
- int imgYn = imgY + (imgXn != (imgX + 1));
-
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_X / filtersPerThread) {
- fPreload[c * filtersPerThread / B_X] = filters
- [(c * filterPixels + (imgYn - imgLoadModPosY) * filterSize +
- (imgXn - imgLoadModPosX)) *
- numFilters];
- }
-
- if (!checkImgBounds || myImgIdx < numImages) {
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) {
- iPreload[c * imgsPerThread / B_Y] = images
- [(c * imgPixels + imgYn * imgSizeX + imgXn) * imgStride];
- }
- } else {
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) {
- iPreload[c * imgsPerThread / B_Y] = 0;
- }
- }
- } else { // move next colorCache
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_X / filtersPerThread) {
- fPreload[c * filtersPerThread / B_X] = filters
- [((c + oc + colorCache) * filterPixels +
- (imgY - imgLoadModPosY) * filterSize +
- (imgX - imgLoadModPosX)) *
- numFilters];
- }
-
- if (!checkImgBounds || myImgIdx < numImages) {
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) {
- iPreload[c * imgsPerThread / B_Y] = images
- [((c + oc + colorCache) * imgPixels + imgY * imgSizeX +
- imgX) *
- imgStride];
- }
- } else {
-#pragma unroll
- for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) {
- iPreload[c * imgsPerThread / B_Y] = 0;
- }
- }
- }
-
- __syncthreads();
-
- // convolution
- for (int c = 0; c < colorCache / 2; c++) {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[f][g] +=
- shImages[c][g * B_X + tx] * shFilters[c][ty + f * B_Y].x;
- prod[f][g] += shImages[c + colorCache / 2][g * B_X + tx] *
- shFilters[c][ty + f * B_Y].y;
- }
- }
- }
- __syncthreads();
- }
- }
- }
-
- // write convolution result into global memory
- if (scale) {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
- if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets *
- targets[g * B_X + f * B_Y * numImages * numModules] +
- scaleOutputs * prod[f][g];
- }
- }
- }
- } else {
-// Note: reversing order of these loops saves 2 registers, but costs time
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int g = 0; g < imgsPerThread; g++) {
- if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
- targets[g * B_X + f * B_Y * numImages * numModules] =
- scaleOutputs * prod[f][g];
- }
- }
- }
- }
-}
-/*
- * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
- * filters: (numFilterColors, filterPixels, numFilters) if conv
- * (numModules, numFilterColors, filterPixels, numFilters)
- * otherwise
- *
- * targets: (numFilters, numModules, numImages)
- *
- * Note: all of these convolution routines are optimized for the case when
- * the number of images (i.e. the minibatch size) is a multiple of 128.
- * Other batch sizes will work, but but I made no attempt whatsoever
- * to make them work fast.
- */
-void _filterActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput,
- bool conv) {
- CAFFE_ENFORCE(images->ndim() == 2);
- CAFFE_ENFORCE(filters->ndim() == 2);
- CAFFE_ENFORCE(targets->ndim() == 2);
-
- int numFilterColors = numImgColors / numGroups;
- int numFilters = filters->dim32(1);
- int numModules = numModulesY * numModulesX;
- int numImages = images->dim32(1);
- int imgPixels = images->dim32(0) / numImgColors;
- int imgSizeX = imgPixels / imgSizeY;
- int filterModuleMult = conv ? 1 : numModules;
-
- CAFFE_ENFORCE(
- numGroups > 1 ||
- (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 4 == 0)));
- CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 4 == 0);
- CAFFE_ENFORCE(numFilters % (16 * numGroups) == 0);
- CAFFE_ENFORCE(numImgColors % numGroups == 0);
- CAFFE_ENFORCE(images->dim32(0) == imgPixels * numImgColors);
- CAFFE_ENFORCE(imgSizeY * imgSizeX == imgPixels);
- int numFiltersPerGroup = numFilters / numGroups;
-
- int imgStride = images->dim32(1);
-
- int filterPixels = filters->dim32(0) / (filterModuleMult * numFilterColors);
- int filterSize = int(sqrt(filterPixels));
- CAFFE_ENFORCE(filterSize * filterSize == filterPixels);
- CAFFE_ENFORCE(
- filters->dim32(0) == filterModuleMult * numFilterColors * filterPixels);
-
- // These routines don't handle the case when only part of the image is visited
- // in the convolution
- CAFFE_ENFORCE(paddingStart <= 0);
- CAFFE_ENFORCE(
- paddingStart + (numModulesX - 1) * moduleStride + filterSize >= imgSizeX);
- CAFFE_ENFORCE(
- paddingStart + (numModulesY - 1) * moduleStride + filterSize >= imgSizeY);
- CAFFE_ENFORCE(moduleStride <= filterSize);
-
- int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
- int filtersPerThread, threadsY = 4;
- if (numImgColors <= 3) {
- // Special kernels written for colors = 3, filters = 64 and colors = 3,
- // filters = 48 cases. The remaining cases use the old routines.
- // TODO: Modernize the remaining cases if you care about them.
- filtersPerThread = numFiltersPerGroup % 64 == 0
- ? 16
- : numFiltersPerGroup % 48 == 0 ? 12
- : numFiltersPerGroup % 32 == 0 ? 8 : 4;
- } else {
- filtersPerThread = numFiltersPerGroup % 64 == 0
- ? 16
- : numFiltersPerGroup % 32 == 0 ? 8 : 4;
- threadsY = numFiltersPerGroup % 128 == 0 && numFilterColors % 8 == 0 &&
- imgsPerThread != 4
- ? 8
- : 4;
- }
- int threadsX = 32;
- dim3 threads(threadsX, threadsY);
- dim3 blocks = dim3(
- DIVUP(numImages, threads.x * imgsPerThread),
- (numModules * numFilters) / (threads.y * filtersPerThread));
-
- bool checkImgBounds = numImages % (threads.x * imgsPerThread) != 0;
- bool scale = scaleTargets != 0;
- if (scaleTargets == 0) {
- targets->Resize(std::vector<int>{numFilters * numModules, numImages});
- } else {
- CAFFE_ENFORCE(targets->dim32(0) == numFilters * numModules);
- CAFFE_ENFORCE(targets->dim32(1) == numImages);
- }
-
- float* images_data = images->mutable_data<float>();
- float* filters_data = filters->mutable_data<float>();
- float* targets_data = targets->mutable_data<float>();
- const std::size_t images_bytes = images->nbytes();
-
- cudaStream_t stream = context->cuda_stream();
-
- checkCudaErrors(cudaDeviceSetSharedMemConfig(
- cudaSharedMemBankSizeEightByte)); // using wider band width
-
- // Auto-generated calling code...
- // NOTE: The calling code is set up such that if checkImgBounds is true, then
- // imgsPerThread = 1. In principle it doesn't have to be this way, and you may
- // want to optimize for that case.
-
- if (scale == false) {
- if (checkImgBounds == false) {
- if (numFilterColors % 8 == 0) {
- if (numImages % 128 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- if (images_bytes < TEXTURE_SIZE_MAX) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_filters = GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex<
- 4,
- 32,
- 4,
- 16,
- 4,
- false,
- false>,
- cudaFuncCachePreferL1);
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex<
- 4,
- 32,
- 4,
- 16,
- 4,
- false,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_filters,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4<
- 4,
- 32,
- 4,
- 16,
- 4,
- false,
- false>,
- cudaFuncCachePreferL1);
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4<
- 4,
- 32,
- 4,
- 16,
- 4,
- false,
- false><<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numFiltersPerGroup % 64 == 0) {
- if (images_bytes < TEXTURE_SIZE_MAX) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_filters = GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex<
- 4,
- 32,
- 4,
- 16,
- 4,
- false,
- false>,
- cudaFuncCachePreferL1);
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex<
- 4,
- 32,
- 4,
- 16,
- 4,
- false,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_filters,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4<
- 4,
- 32,
- 4,
- 16,
- 4,
- false,
- false>,
- cudaFuncCachePreferL1);
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4<
- 4,
- 32,
- 4,
- 16,
- 4,
- false,
- false><<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 4, 4, 8, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 4, 4, 8, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 64 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 2, 4, 8, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 2, 4, 8, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 32 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors % 4 == 0) {
- if (numImages % 128 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 4, 16, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 4, 8, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 4, 8, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 4, 4, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 4, 4, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 64 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 2, 16, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 2, 8, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 2, 8, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 2, 4, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 2, 4, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 32 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 3) {
- if (numImages % 128 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_filters = GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex<
- 4,
- 32,
- 4,
- 16,
- 3,
- 4,
- false,
- false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex<
- 4,
- 32,
- 4,
- 16,
- 3,
- 4,
- false,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_filters,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_filters = GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex<
- 4,
- 32,
- 4,
- 12,
- 3,
- 4,
- false,
- false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex<
- 4,
- 32,
- 4,
- 12,
- 3,
- 4,
- false,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_filters,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 8, 3, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 8, 3, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 4, 3, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 4, 3, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 64 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 16, 3, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 16, 3, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 12, 3, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 12, 3, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 8, 3, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 8, 3, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 4, 3, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 4, 3, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 32 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 3, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 3, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 2) {
- if (numImages % 128 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 16, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 16, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 12, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 12, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 8, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 8, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 4, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 4, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 64 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 16, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 16, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 12, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 12, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 8, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 8, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 4, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 4, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 32 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 1) {
- if (numImages % 128 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 16, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 16, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 12, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 12, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 8, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 8, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 4, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 4, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 64 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 16, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 16, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 12, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 12, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 8, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 8, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 4, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 4, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 32 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- }
- } else if (checkImgBounds == true) {
- if (numFilterColors % 8 == 0) {
- if (numImages % 1 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<8, 32, 1, 16, 8, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<8, 32, 1, 16, 8, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 16, 8, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 16, 8, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 8, 8, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 8, 8, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 4, 8, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors % 4 == 0) {
- if (numImages % 1 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 8, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 4, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 3) {
- if (numImages % 1 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 3, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 3, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 3, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 3, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 3, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 2) {
- if (numImages % 1 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 2, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 2, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 2, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 2, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 1) {
- if (numImages % 1 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 1, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 1, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 1, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 1, 4, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- }
- }
- } else if (scale == true) {
- if (checkImgBounds == false) {
- if (numFilterColors % 8 == 0) {
- if (numImages % 128 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- if (images_bytes < TEXTURE_SIZE_MAX) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_filters = GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex<
- 4,
- 32,
- 4,
- 16,
- 4,
- true,
- false>,
- cudaFuncCachePreferL1);
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex<
- 4,
- 32,
- 4,
- 16,
- 4,
- true,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_filters,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4<
- 4,
- 32,
- 4,
- 16,
- 4,
- true,
- false>,
- cudaFuncCachePreferL1);
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4<
- 4,
- 32,
- 4,
- 16,
- 4,
- true,
- false><<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numFiltersPerGroup % 64 == 0) {
- if (images_bytes < TEXTURE_SIZE_MAX) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_filters = GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex<
- 4,
- 32,
- 4,
- 16,
- 4,
- true,
- false>,
- cudaFuncCachePreferL1);
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex<
- 4,
- 32,
- 4,
- 16,
- 4,
- true,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_filters,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4<
- 4,
- 32,
- 4,
- 16,
- 4,
- true,
- false>,
- cudaFuncCachePreferL1);
- filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4<
- 4,
- 32,
- 4,
- 16,
- 4,
- true,
- false><<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<4, 32, 4, 8, 8, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 4, 4, 8, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 4, 4, 8, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 64 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<8, 32, 2, 16, 8, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<4, 32, 2, 16, 8, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<4, 32, 2, 8, 8, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 2, 4, 8, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 2, 4, 8, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 32 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<8, 32, 1, 16, 8, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<4, 32, 1, 16, 8, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2_f_vec<4, 32, 1, 8, 8, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors % 4 == 0) {
- if (numImages % 128 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 4, 16, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 4, 8, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 4, 8, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 4, 4, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 4, 4, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 64 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 2, 16, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 2, 8, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 2, 8, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 2, 4, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 2, 4, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 32 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 3) {
- if (numImages % 128 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_filters = GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex<
- 4,
- 32,
- 4,
- 16,
- 3,
- 4,
- true,
- false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_16_px_4_cc_3_tex<
- 4,
- 32,
- 4,
- 16,
- 3,
- 4,
- true,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_filters,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_filters = GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex<
- 4,
- 32,
- 4,
- 12,
- 3,
- 4,
- true,
- false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color_preload_ty_4_tx_32_i_4_f_12_px_4_cc_3_tex<
- 4,
- 32,
- 4,
- 12,
- 3,
- 4,
- true,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_filters,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 8, 3, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 8, 3, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 4, 3, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 4, 3, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 64 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 16, 3, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 16, 3, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 12, 3, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 12, 3, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 8, 3, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 8, 3, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 4, 3, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 4, 3, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 32 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 3, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 3, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 2) {
- if (numImages % 128 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 16, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 16, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 12, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 12, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 8, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 8, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 4, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 4, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 64 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 16, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 16, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 12, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 12, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 8, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 8, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 4, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 4, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 32 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 1) {
- if (numImages % 128 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 16, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 16, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 12, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 12, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 8, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 8, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 4, 4, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 4, 4, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 64 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 16, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 16, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 12, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 12, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 8, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 8, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 2, 4, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 2, 4, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- } else if (numImages % 32 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, false>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- }
- } else if (checkImgBounds == true) {
- if (numFilterColors % 8 == 0) {
- if (numImages % 1 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<8, 32, 1, 16, 8, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<8, 32, 1, 16, 8, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 16, 8, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 16, 8, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 8, 8, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 8, 8, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 4, 8, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors % 4 == 0) {
- if (numImages % 1 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 16, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 8, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_sparse2<4, 32, 1, 4, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 3) {
- if (numImages % 1 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 3, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 3, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 3, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 3, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 3, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 2) {
- if (numImages % 1 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 2, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 2, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 2, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 2, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- } else if (numFilterColors == 1) {
- if (numImages % 1 == 0) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 16, 1, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 12, 1, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 8, 1, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 8, 1, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- } else if (numFiltersPerGroup % 1 == 0) {
- cudaFuncSetCacheConfig(
- filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, true>,
- cudaFuncCachePreferShared);
- filterActs_YxX_color<4, 32, 1, 4, 1, 4, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- filters_data,
- targets_data,
- numImages,
- numFilters,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- numModulesY,
- numModulesX,
- imgStride,
- scaleTargets,
- scaleOutput,
- conv);
- }
- }
- }
- }
- }
-
- checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
- getLastCudaError("filterActs: kernel execution failed");
-}
-
-void convFilterActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups) {
- convFilterActs(
- context,
- images,
- filters,
- targets,
- imgSizeY,
- numModulesY,
- numModulesX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- 0,
- 1);
-}
-
-void convFilterActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput) {
- _filterActs(
- context,
- images,
- filters,
- targets,
- imgSizeY,
- numModulesY,
- numModulesX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- true);
-}
-
-void localFilterActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups) {
- localFilterActs(
- context,
- images,
- filters,
- targets,
- imgSizeY,
- numModulesY,
- numModulesX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- 0,
- 1);
-}
-
-void localFilterActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput) {
- _filterActs(
- context,
- images,
- filters,
- targets,
- imgSizeY,
- numModulesY,
- numModulesX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- false);
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vector>
-
-#include "../include/cudaconv2.cuh"
-
-/*
- * Block size: 16x16.
- * blockIdx.x determines case in batches of 16*imgsPerThread.
- * blockIdx.y determines 4x4 image region in target image.
- *
- * threadIdx.x determines case.
- * threadIdx.y determines pixel.
- *
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- * filters: (numColors, filterPixels, numFilters) if conv (numModulesY,
- * numModulesX, numColors, filterPixels, numFilters) otherwise targets:
- * (numColors, imgSizeY, imgSizeX, numImages)
- *
- * Each block reconstructs one 4x4 pixels from 16*imgsPerThread cases.
- *
- * Number of filters must be divisible by 16.
- * Number of images must be divisible by 16*imgsPerThread if checkCaseBounds is
- * false. 16 * imgsPerThread must be divisible by 32.
- *
- * This version loads 32 cases at a time, so it gets full coalescing on that
- * load. It only loads 16 weights at a time, so those aren't fully coalesced.
- * This version conserves shared memory by loading 16 filters at a time rather
- * than 32.
- */
-template <
- int imgsPerThread,
- int numColors,
- bool scale,
- bool checkCaseBounds,
- bool conv>
-__global__ void img_acts_color(
- const float* hidActs,
- const float* filters,
- float* targets,
- const int numModulesY,
- const int numModulesX,
- const int numImages,
- const int numFilters,
- const int filterSize,
- const int imgSizeY,
- const int imgSizeX,
- const int paddingStart,
- const int moduleStride,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shFilters[numColors * 16][16 + 1];
- __shared__ float shHidActs[16][16 * imgsPerThread];
-
- const int blockCaseIdx = blockIdx.x * 16 * imgsPerThread;
- const int numRegionsX = DIVUP(imgSizeX, 4);
- const int blockRegionIdx = blockIdx.y;
- const int blockRegionIdxX = blockRegionIdx % numRegionsX;
- const int blockRegionIdxY = blockRegionIdx / numRegionsX;
- const int blockRegionLeft = blockRegionIdxX * 4;
- const int blockRegionTop = blockRegionIdxY * 4;
- const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4;
- const int pxY = blockRegionTop + pxYInRegion;
- const int pxX = blockRegionLeft + pxXInRegion;
- const int pxIdx = pxY * imgSizeX + pxX;
- const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX;
- const int numModules = numModulesY * numModulesX;
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeX * imgSizeY;
- const int tidx = threadIdx.y * 16 + threadIdx.x;
- const int loadY = tidx / 32, loadX = tidx % 32;
-
- hidActs += blockCaseIdx + loadY * numImages * numModules + loadX;
- filters += threadIdx.x;
- targets += pxIdx * numImages + blockCaseIdx + threadIdx.x;
-
- float prod[numColors][imgsPerThread];
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[c][i] = 0;
- }
- }
- const int startY = blockRegionTop - paddingStart < filterSize
- ? 0
- : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride;
- const int endY =
- MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride);
- const int startX = blockRegionLeft - paddingStart < filterSize
- ? 0
- : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride;
- const int endX =
- MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride);
-
- float* shilterLoad = &shFilters[threadIdx.y][threadIdx.x];
- float* shHidActLoad = &shHidActs[loadY][loadX];
-
- for (int my = startY; my < endY; my++) {
- const int moduleTop = paddingStart + my * moduleStride;
- const int pxInModuleY = pxY - moduleTop;
-
- for (int mx = startX; mx < endX; mx++) {
- const int moduleIdx = my * numModulesX + mx;
- const int moduleLeft = paddingStart + mx * moduleStride;
- const int pxInModuleX = pxX - moduleLeft;
-
- const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize &&
- pxInModuleX >= 0 && pxInModuleX < filterSize;
- const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX;
-
- for (int f = 0; f < numFilters;
- f += 16) { // multiply with 16 filters at a time
- // Now the threads split up into half-warps, and each half-warp decides
- // if it's interested.
- const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
-#pragma unroll
- for (int i = 0; i < imgsPerThread * 16; i += 32) {
- if (!checkCaseBounds || blockCaseIdx + i + loadX < numImages) {
-#pragma unroll
- for (int j = 0; j < 16;
- j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32
- // elements at a time.
- shHidActLoad[j * 16 * imgsPerThread + i] =
- hLoad[j * numModules * numImages + i];
- }
- } else {
-#pragma unroll
- for (int j = 0; j < 16;
- j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32
- // elements at a time.
- shHidActLoad[j * 16 * imgsPerThread + i] = 0;
- }
- }
- }
-
- if (isPxInImg && isPxInModule) {
- // This half-warp is interested, so it's going to load the weights
- // from this module to its pixel. Not fully coalesced read :( But
- // taking out this read entirely only reduces the runtime by ~2.8%, so
- // it isn't costing me much.
- const float* fLoad = conv
- ? &filters[pxIdxInModule * numFilters + f]
- : &filters
- [(moduleIdx * numColors * filterPixels + pxIdxInModule) *
- numFilters +
- f];
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shilterLoad[c * 16 * (16 + 1)] =
- fLoad[c * filterPixels * numFilters];
- }
- }
-
- __syncthreads();
- // Do some actual computation
- if (isPxInImg && isPxInModule) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int w = 0; w < 16; w++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[c][i] += shFilters[threadIdx.y + c * 16][w] *
- shHidActs[w][threadIdx.x + i * 16];
- }
- }
- }
- }
- __syncthreads();
- }
- }
- }
- // Not fully coalesced write :(... shmem (and fully coalesced) version is
- // actually slightly slower, though
- if (isPxInImg) {
- if (scale) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds ||
- blockCaseIdx + threadIdx.x + i * 16 < numImages) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- targets[c * imgPixels * numImages + i * 16] =
- scaleTargets * targets[c * imgPixels * numImages + i * 16] +
- scaleOutputs * prod[c][i];
- }
- }
- }
- } else {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds ||
- blockCaseIdx + threadIdx.x + i * 16 < numImages) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- targets[c * imgPixels * numImages + i * 16] =
- scaleOutputs * prod[c][i];
- }
- }
- }
- }
- }
-}
-
-/*
- * Block size: 16x16.
- * blockIdx.x determines case in batches of 16*imgsPerThread, also color in
- * batches of colorsPerThread. In essence, blockIdx.x.x
- * = 1..numImages/(16*imgsPerThread) blockIdx.x.y
- * = 1..numImgColors/colorsPerThread blockIdx.y determines 4x4 image region in
- * target image.
- *
- * threadIdx.x determines case.
- * threadIdx.y determines pixel.
- *
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- * filters: (numFilterColors, filterPixels, numFilters) if conv
- * (numModulesY, numModulesX, numFilterColors, filterPixels,
- * numFilters) otherwise targets: (numImageColors, imgSizeY, imgSizeX,
- * numImages)
- *
- * Each block reconstructs one 4x4 pixels from 16*imgsPerThread cases.
- *
- * numImages must be divisible by 16*imgsPerThread if checkCaseBounds is false.
- * 16 * imgsPerThread must be divisible by 32.
- * numImageColors/numGroups must be divisible by colorsPerThread.
- *
- * This version loads 32 cases at a time, so it gets full coalescing on that
- * load. It only loads 16 weights at a time, so those aren't fully coalesced.
- * This version conserves shared memory by loading 16 filters at a time rather
- * than 32.
- *
- * To be used when there are 4-16 color channels.
- */
-template <
- int imgsPerThread,
- int colorsPerThread,
- bool scale,
- bool checkCaseBounds,
- bool conv>
-__global__ void img_acts_mediumcolor(
- const float* hidActs,
- const float* filters,
- float* targets,
- const int numModulesY,
- const int numModulesX,
- const int numImages,
- const int numFilters,
- const int filterSize,
- const int imgSizeY,
- const int imgSizeX,
- const int paddingStart,
- const int moduleStride,
- const int numImgColors,
- const int numGroups,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shFilters[colorsPerThread * 16][16 + 1];
- __shared__ float shHidActs[16][16 * imgsPerThread];
-
- const int numImgBlocks = DIVUP(numImages, 16 * imgsPerThread);
- const int blockCaseIdx = (blockIdx.x % numImgBlocks) * 16 * imgsPerThread;
-
- const int imgColorIdx =
- (blockIdx.x / numImgBlocks) * colorsPerThread; // color idx globally
- const int numFilterColors = numImgColors / numGroups;
- const int blockGroupIdx = imgColorIdx / numFilterColors;
- const int filterColorIdx =
- imgColorIdx % numFilterColors; // color idx within group
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
-
- const int numRegionsX = DIVUP(imgSizeX, 4);
- const int blockRegionIdx = blockIdx.y;
- const int blockRegionIdxX = blockRegionIdx % numRegionsX;
- const int blockRegionIdxY = blockRegionIdx / numRegionsX;
- const int blockRegionLeft = blockRegionIdxX * 4;
- const int blockRegionTop = blockRegionIdxY * 4;
- const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4;
- const int pxY = blockRegionTop + pxYInRegion;
- const int pxX = blockRegionLeft + pxXInRegion;
- const int pxIdx = pxY * imgSizeX + pxX;
- const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX;
- const uint numModules = numModulesY * numModulesX;
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
- const int tidx = threadIdx.y * 16 + threadIdx.x;
- const int loadY = tidx / 32, loadX = tidx % 32;
-
- hidActs +=
- blockCaseIdx + (blockFilterIdx + loadY) * numImages * numModules + loadX;
- filters +=
- blockFilterIdx + filterColorIdx * filterPixels * numFilters + threadIdx.x;
- targets += imgColorIdx * imgPixels * numImages + pxIdx * numImages +
- blockCaseIdx + threadIdx.x;
-
- float prod[colorsPerThread][imgsPerThread];
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[c][i] = 0;
- }
- }
- const int startY = blockRegionTop - paddingStart < filterSize
- ? 0
- : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride;
- const int endY =
- MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride);
- const int startX = blockRegionLeft - paddingStart < filterSize
- ? 0
- : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride;
- const int endX =
- MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride);
-
- float* shFilterLoad = &shFilters[threadIdx.y][threadIdx.x];
- float* shHidActLoad = &shHidActs[loadY][loadX];
-
- for (int my = startY; my < endY; my++) {
- const int moduleTop = paddingStart + my * moduleStride;
- const int pxInModuleY = pxY - moduleTop;
-
- for (int mx = startX; mx < endX; mx++) {
- const int moduleIdx = my * numModulesX + mx;
- const int moduleLeft = paddingStart + mx * moduleStride;
- const int pxInModuleX = pxX - moduleLeft;
-
- const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize &&
- pxInModuleX >= 0 && pxInModuleX < filterSize;
- const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX;
-
- for (int f = 0; f < numFiltersPerGroup;
- f += 16) { // multipply with 16 filters at a time
- // Now the threads split up into half-warps, and each half-warp decides
- // if it's interested.
- const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
-#pragma unroll
- for (int i = 0; i < imgsPerThread * 16; i += 32) {
- if (!checkCaseBounds || blockCaseIdx + loadX + i < numImages) {
-#pragma unroll
- for (int j = 0; j < 16;
- j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32
- // elements at a time.
- shHidActLoad[j * 16 * imgsPerThread + i] =
- hLoad[j * numModules * numImages + i];
- }
- } else {
-#pragma unroll
- for (int j = 0; j < 16;
- j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32
- // elements at a time.
- shHidActLoad[j * 16 * imgsPerThread + i] = 0;
- }
- }
- }
-
- if (isPxInImg && isPxInModule) {
- // This half-warp is interested, so it's going to load the weights
- // from this module to its pixel.
-
- // Not fully coalesced read :(
- // But taking out this read entirely only reduces the runtime by
- // ~2.8%, so it isn't costing me much.
- const float* fLoad = conv
- ? &filters[pxIdxInModule * numFilters + f]
- : &filters
- [moduleIdx * numFilterColors * filterPixels * numFilters +
- pxIdxInModule * numFilters + f];
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- shFilterLoad[c * 16 * (16 + 1)] =
- fLoad[c * filterPixels * numFilters];
- }
- }
-
- __syncthreads();
- // Do some actual computation
- if (isPxInImg && isPxInModule) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int w = 0; w < 16; w++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[c][i] += shFilters[threadIdx.y + c * 16][w] *
- shHidActs[w][threadIdx.x + i * 16];
- }
- }
- }
- }
- __syncthreads();
- }
- }
- }
- // Not fully coalesced write :(... shmem (and fully coalesced) version is
- // actually slightly slower, though
- if (isPxInImg) {
- if (scale) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds ||
- blockCaseIdx + threadIdx.x + i * 16 < numImages) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- targets[c * imgPixels * numImages + i * 16] =
- scaleTargets * targets[c * imgPixels * numImages + i * 16] +
- scaleOutputs * prod[c][i];
- }
- }
- }
- } else {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds ||
- blockCaseIdx + threadIdx.x + i * 16 < numImages) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- targets[c * imgPixels * numImages + i * 16] =
- scaleOutputs * prod[c][i];
- }
- }
- }
- }
- }
-}
-
-/*
- * Block size: B_YxB_X.
- * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in
- batches of B_Y*colorsPerThread.
- * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread)
- * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread)
- * blockIdx.y determines image pixel in target image.
- *
- * threadIdx.x determines case.
- * threadIdx.y determines color.
- *
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- * filters: (numFilterColors, filterPixels, numFilters) if conv
- * (numModulesY, numModulesX, numFilterColors, filterPixels,
- numFilters) otherwise
- * targets: (numImageColors, imgSizeY, imgSizeX, numImages)
- *
- * Each block reconstructs one B_Y*colorsPerThread colors from 1 pixel from
- B_X*imgsPerThread cases.
- *
- * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false.
- * numFiltersPerGroup must be divisible by filterCache.
- *
- * B_X * imgsPerThread must be divisible by 32.
- * numFilterColors must be divisible by B_Y*colorsPerThread.
- * B_X*B_Y must be divisible by 32.
- * filterCache must be divisible by B_X*B_Y/32
- * B_X*B_Y must be divisible by filterCache
-
- * This version loads 32 cases at a time, so it gets full coalescing on that
- load.
- * It only loads filterCache weights at a time, so those aren't fully coalesced
- (depending on size of filterCache).
- *
- * To be used when there are >= 16 color channels.
- */
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int colorsPerThread,
- int filterCache,
- bool scale,
- bool checkCaseBounds,
- bool conv>
-__global__ void conv_img_acts_manycolor(
- const float* hidActs,
- const float* filters,
- float* targets,
- const int numModulesY,
- const int numModulesX,
- const int numImages,
- const int numFilters,
- const int filterSize,
- const int imgSizeY,
- const int imgSizeX,
- const int paddingStart,
- const int moduleStride,
- const int numImgColors,
- const int numGroups,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shFilters[colorsPerThread * B_Y][filterCache + 1];
- __shared__ float shHidActs[filterCache][B_X * imgsPerThread];
-
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
-
- const int imgColorIdx =
- (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally
- const int numFilterColors = numImgColors / numGroups;
- const int blockGroupIdx = imgColorIdx / numFilterColors;
- const int filterColorIdx =
- imgColorIdx % numFilterColors; // color idx within group
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
-
- const int blockPixelIdx = blockIdx.y;
- const int blockPixelIdxX = blockPixelIdx % imgSizeX;
- const int blockPixelIdxY = blockPixelIdx / imgSizeX;
-
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
- const int tidx = threadIdx.y * B_X + threadIdx.x;
- const int hidActLoadY = tidx / 32, hidActLoadX = tidx % 32;
- const int filtersLoadY = tidx / filterCache,
- filtersLoadX = tidx % filterCache;
- const int numModules = numModulesY * numModulesX;
-
- hidActs += blockCaseIdx +
- (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX;
- filters += blockFilterIdx +
- (filterColorIdx + filtersLoadY) * filterPixels * numFilters +
- filtersLoadX;
- targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages +
- blockPixelIdx * numImages + blockCaseIdx + threadIdx.x;
-
- float prod[colorsPerThread][imgsPerThread];
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[c][i] = 0;
- }
- }
-
- const int startY = blockPixelIdxY - paddingStart < filterSize
- ? 0
- : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride;
- const int endY =
- MIN(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride);
- const int startX = blockPixelIdxX - paddingStart < filterSize
- ? 0
- : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride;
- const int endX =
- MIN(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride);
-
- float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX];
- float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX];
-
- for (int my = startY; my < endY; my++) {
- const int moduleTop = paddingStart + my * moduleStride;
- const int pxInFilterY = blockPixelIdxY - moduleTop;
-
- for (int mx = startX; mx < endX; mx++) {
- const int moduleIdx = my * numModulesX + mx;
- const int moduleLeft = paddingStart + mx * moduleStride;
- const int pxInFilterX = blockPixelIdxX - moduleLeft;
-
- const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX;
-
- for (int f = 0; f < numFiltersPerGroup;
- f += filterCache) { // multiply with filterCache filters at a time
- const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
-#pragma unroll
- for (int i = 0; i < imgsPerThread * B_X; i += 32) {
- if (!checkCaseBounds || blockCaseIdx + hidActLoadX + i < numImages) {
-#pragma unroll
- for (int j = 0; j < filterCache; j +=
- B_X * B_Y / 32) { // load filterCache rows of imgsPerThread*B_X
- // cols, 8 * 32 elements at a time.
- shHidActLoad[j * B_X * imgsPerThread + i] =
- hLoad[j * numModules * numImages + i];
- }
- } else {
-#pragma unroll
- for (int j = 0; j < filterCache; j +=
- B_X * B_Y / 32) { // load filterCache rows of imgsPerThread*B_X
- // cols, 8 * 32 elements at a time.
- shHidActLoad[j * B_X * imgsPerThread + i] = 0;
- }
- }
- }
- const float* fLoad = conv
- ? &filters[pxIdxInFilter * numFilters + f]
- : &filters
- [moduleIdx * numFilterColors * filterPixels * numFilters +
- pxIdxInFilter * numFilters + f];
-#pragma unroll
- for (int i = 0; i < colorsPerThread * B_Y;
- i += B_X * B_Y / filterCache) {
- if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCache) == 0 ||
- i + filtersLoadY < colorsPerThread * B_Y) {
- shFilterLoad[i * (filterCache + 1)] =
- fLoad[i * filterPixels * numFilters];
- }
- }
-
- __syncthreads();
-// Do some actual computation
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int w = 0; w < filterCache; w++) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- prod[c][i] += shFilters[c * B_Y + threadIdx.y][w] *
- shHidActs[w][threadIdx.x + i * B_X];
- }
- }
- }
- __syncthreads();
- }
- }
- }
- if (scale) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds ||
- blockCaseIdx + threadIdx.x + i * B_X < numImages) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets *
- targets[c * B_Y * imgPixels * numImages + i * B_X] +
- scaleOutputs * prod[c][i];
- }
- }
- }
- } else {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds ||
- blockCaseIdx + threadIdx.x + i * B_X < numImages) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- targets[c * B_Y * imgPixels * numImages + i * B_X] =
- scaleOutputs * prod[c][i];
- }
- }
- }
- }
-}
-
-/*
- * Block size: B_YxB_X.
- * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in
- * batches of B_Y*colorsPerThread. In essence, blockIdx.x.x
- * = 1..numImages/(B_X*imgsPerThread) blockIdx.x.y
- * = 1..numImgColors/(B_Y*colorsPerThread) blockIdx.y determines image pixel in
- * target image.
- *
- * threadIdx.x determines case.
- * threadIdx.y determines color.
- *
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- * filters: (numFilterColors, filterPixels, numFilters) if conv
- * (numModulesY, numModulesX, numFilterColors, filterPixels,
- * numFilters) otherwise targets: (numImageColors, imgSizeY, imgSizeX,
- * numImages)
- *
- * Each block reconstructs one B_Y*colorsPerThread colors from 1 pixel from
- * B_X*imgsPerThread cases.
- *
- * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false.
- * numFiltersPerGroup must be divisible by filterCacheF.
- *
- * numFilterColors must be divisible by B_Y*colorsPerThread.
- * B_X*B_Y must be divisible by filterCacheF
- * filterCacheF must be divisible by filterCacheH
- *
- * This version loads 32 cases at a time, so it gets full coalescing on that
- * load. It only loads filterCacheF weights at a time, so those aren't fully
- * coalesced (depending on size of filterCacheF).
- *
- * To be used when there are >= 16 color channels.
- */
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int colorsPerThread,
- int filterCacheF,
- int filterCacheH,
- bool scale,
- bool checkCaseBounds,
- bool conv>
-__global__ void conv_img_acts_manycolor_kepler(
- const float* hidActs,
- const float* filters,
- float* targets,
- const int numModulesY,
- const int numModulesX,
- const int numImages,
- const int numFilters,
- const int filterSize,
- const int imgSizeY,
- const int imgSizeX,
- const int paddingStart,
- const int moduleStride,
- const int numImgColors,
- const int numGroups,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shFilters[colorsPerThread * B_Y][filterCacheF];
- __shared__ float shHidActs[filterCacheH][B_X * imgsPerThread];
-
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
-
- const int imgColorIdx =
- (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally
- const int numFilterColors = numImgColors / numGroups;
- const int blockGroupIdx = imgColorIdx / numFilterColors;
- const int filterColorIdx =
- imgColorIdx % numFilterColors; // color idx within group
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
-
- const int blockPixelIdx = blockIdx.y;
- const int blockPixelIdxX = blockPixelIdx % imgSizeX;
- const int blockPixelIdxY = blockPixelIdx / imgSizeX;
-
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
- const int tidx = threadIdx.y * B_X + threadIdx.x;
- const int hidActLoadY = threadIdx.y, hidActLoadX = threadIdx.x;
- // const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx %
- // (B_X*imgsPerThread);
- const int filtersLoadY = tidx / filterCacheF,
- filtersLoadX = tidx % filterCacheF;
- // nvcc is behaving idiotically again, these useless declarations save
- // registers
- // const int outputY = threadIdx.y, outputX = threadIdx.x;
- // const int ty = threadIdx.y, tx = threadIdx.x;
- const int numModules = numModulesY * numModulesX;
-
- hidActs += blockCaseIdx +
- (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX;
- filters += blockFilterIdx +
- (filterColorIdx + filtersLoadY) * filterPixels * numFilters +
- filtersLoadX;
- targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages +
- blockPixelIdx * numImages + blockCaseIdx + threadIdx.x;
-
- float prod[colorsPerThread][imgsPerThread];
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[c][i] = 0;
- }
- }
-
- const int startY = blockPixelIdxY - paddingStart < filterSize
- ? 0
- : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride;
- const int endY =
- min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride);
- const int startX = blockPixelIdxX - paddingStart < filterSize
- ? 0
- : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride;
- const int endX =
- min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride);
-
- float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX];
- float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX];
- // const bool noFLoop = filterCacheF == filterCacheH;
- for (int my = startY; my < endY; my++) {
- const int moduleTop = paddingStart + my * moduleStride;
- const int pxInFilterY = blockPixelIdxY - moduleTop;
-
- for (int mx = startX; mx < endX; mx++) {
- const int moduleIdx = my * numModulesX + mx;
- const int moduleLeft = paddingStart + mx * moduleStride;
- const int pxInFilterX = blockPixelIdxX - moduleLeft;
-
- const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX;
-
- for (int f = 0; f < numFiltersPerGroup;
- f += filterCacheF) { // multiply with filterCacheF filters at a time
- const float* fLoad = conv
- ? &filters[pxIdxInFilter * numFilters + f]
- : &filters
- [moduleIdx * numFilterColors * filterPixels * numFilters +
- pxIdxInFilter * numFilters + f];
-#pragma unroll
- for (int i = 0; i < colorsPerThread * B_Y;
- i += B_X * B_Y / filterCacheF) {
- if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 ||
- i + filtersLoadY < colorsPerThread * B_Y) {
- shFilterLoad[i * filterCacheF] =
- fLoad[i * filterPixels * numFilters];
- }
- }
- //#pragma unroll
-
- for (int fh = f; fh < f + filterCacheF; fh += filterCacheH) {
- // conv_img_acts_manycolor_dummy_fhLoop<B_Y, B_X, imgsPerThread,
- // colorsPerThread, filterCacheF, filterCacheH,
- // checkCaseBounds>(hidActs, shHidActLoad, shHidActs, shFilters,
- // moduleIdx, numImages, hidActLoadY, hidActLoadX, blockCaseIdx,
- // numModules, f, fh, prod);
-
- const float* hLoad =
- &hidActs[(moduleIdx + fh * numModules) * numImages];
-
-#pragma unroll
- for (int j = 0; j < filterCacheH; j += B_Y) {
- if (filterCacheH % B_Y == 0 || hidActLoadY + j < filterCacheH) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread * B_X; i += B_X) {
- if (!checkCaseBounds ||
- blockCaseIdx + hidActLoadX + i < numImages) {
- shHidActLoad[j * B_X * imgsPerThread + i] =
- hLoad[j * numModules * numImages + i];
- } else {
- shHidActLoad[j * B_X * imgsPerThread + i] = 0;
- }
- }
- }
- }
-
- __syncthreads();
-
-// Do some actual computation
-// Using these variables causes register usage to go from 161 --> 123.
-// But nonetheless, the high-register version is faster.
-// const float* shF = &shFilters[threadIdx.y][fh-f];
-// const float* const shF2 = &shFilters[threadIdx.y][fh];
-// const float* shH = &shHidActs[0][threadIdx.x];
-#pragma unroll
- for (int w = 0; w < filterCacheH; w++) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- prod[c][i] += shFilters[c * B_Y + threadIdx.y][fh - f + w] *
- shHidActs[w][threadIdx.x + i * B_X];
- }
- }
- }
- __syncthreads();
- }
- }
- }
- }
- if (scale) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds ||
- blockCaseIdx + threadIdx.x + i * B_X < numImages) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets *
- targets[c * B_Y * imgPixels * numImages + i * B_X] +
- scaleOutputs * prod[c][i];
- }
- }
- }
- } else {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds ||
- blockCaseIdx + threadIdx.x + i * B_X < numImages) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- targets[c * B_Y * imgPixels * numImages + i * B_X] =
- scaleOutputs * prod[c][i];
- }
- }
- }
- }
-}
-
-/*
- * New Titan-optimized stuff.
- */
-
-__device__ __forceinline__ void
-conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(
- const int my,
- const int mx,
- const int numModulesX,
- const int paddingStart,
- const int moduleStride,
- const int blockPixelIdxY,
- const int blockPixelIdxX,
- const int filterSize,
- int& moduleIdx,
- int& pxIdxInFilter) {
- const int moduleTop = paddingStart + my * moduleStride;
- const int pxInFilterY = blockPixelIdxY - moduleTop;
-
- moduleIdx = my * numModulesX + mx; // out
- const int moduleLeft = paddingStart + mx * moduleStride;
- const int pxInFilterX = blockPixelIdxX - moduleLeft;
-
- pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; // out
-}
-
-#define IA_PRELOAD_LOOP(w, offset) \
- _Pragma("unroll") for (int i = 0; i < imgsPerThread; i++) { \
- _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) { \
- prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w) + (offset)] * \
- shHidActs[w][threadIdx.x * imgsPerThread + i]; \
- } \
- }
-
-/*
- * Same loop as above but inverted.
- */
-#define IA_PRELOAD_LOOP2(w, offset) \
- _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) { \
- _Pragma("unroll") for (int i = 0; i < imgsPerThread; i++) { \
- prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w) + (offset)] * \
- shHidActs[w][threadIdx.x * imgsPerThread + i]; \
- } \
- }
-
-#define IA_PRELOAD_LOOP3(i, offset) \
- _Pragma("unroll") for (int w = 0; w < filterCacheH; w++) { \
- _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) { \
- prod[c][i] += shFilters[c * B_Y + threadIdx.y][(w) + (offset)] * \
- shHidActs[w][threadIdx.x * imgsPerThread + i]; \
- } \
- }
-
-#define IA_PRELOAD_W(z) \
- wPreload[z] = fLoad[(z)*B_X * B_Y / filterCacheF * filterPixels * numFilters];
-#define IA_PRELOAD_W_TX(z) \
- wPreload[z] = tex1Dfetch<float>( \
- filters, \
- filtersLoadOffset + \
- (z)*B_X * B_Y / filterCacheF * filterPixels * numFilters);
-#define IA_PRELOAD_H(y, x) \
- if (!checkCaseBounds || myCaseIdx + (x)*B_X < numImages) { \
- hPreload[y][x] = hLoad[(y)*B_Y * numModules * numImages + (x)*B_X]; \
- }
-#define IA_PRELOAD_H_TX(y, x) \
- if (!checkCaseBounds || myCaseIdx + (x)*B_X < numImages) { \
- hPreload[y][x] = tex1Dfetch<float>( \
- hidActs, \
- hidActsLoadOffset + (y)*B_Y * numModules * numImages + (x)*B_X); \
- }
-
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int colorsPerThread,
- int filterCacheF,
- int filterCacheH,
- bool scale,
- bool checkCaseBounds,
- bool conv>
-__global__ void __launch_bounds__(
- 256,
- 2) // 256 threads per block, 2 blocks per multiprocessor
- // These launch bounds ensure 25% occupancy (128 registers used)
- // as oppposed to 13% (130 registers) achieved by defaults.
- conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(
- cudaTextureObject_t hidActs,
- cudaTextureObject_t filters,
- float* targets,
- const int numModulesY,
- const int numModulesX,
- const int numImages,
- const int numFilters,
- const int filterSize,
- const int imgSizeY,
- const int imgSizeX,
- const int paddingStart,
- const int moduleStride,
- const int numImgColors,
- const int numGroups,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shFilters[colorsPerThread * B_Y][filterCacheF];
- __shared__ float shHidActs[filterCacheH][B_X * imgsPerThread];
-
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
- const int myCaseIdx = blockCaseIdx + threadIdx.x;
-
- const int imgColorIdx =
- (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally
- const int numFilterColors = numImgColors / numGroups;
- const int blockGroupIdx = imgColorIdx / numFilterColors;
- const int filterColorIdx =
- imgColorIdx % numFilterColors; // color idx within group
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
-
- const int blockPixelIdx = blockIdx.y;
- const int blockPixelIdxX = blockPixelIdx % imgSizeX;
- const int blockPixelIdxY = blockPixelIdx / imgSizeX;
-
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
- const int tidx = threadIdx.y * B_X + threadIdx.x;
- // const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x %
- // B_X;
- // const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx %
- // (B_X*imgsPerThread);
- const int filtersLoadY = tidx / filterCacheF,
- filtersLoadX = tidx % filterCacheF;
- // nvcc is behaving idiotically again, these useless declarations save
- // registers
- // const int outputY = threadIdx.y, outputX = threadIdx.x;
- // const int ty = threadIdx.y, tx = threadIdx.x;
- const int numModules = numModulesY * numModulesX;
- const int hidActsOffset =
- (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx;
- const int filtersOffset = blockFilterIdx +
- (filterColorIdx + filtersLoadY) * filterPixels * numFilters +
- filtersLoadX;
- // hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules +
- // myCaseIdx; filters += blockFilterIdx + (filterColorIdx + filtersLoadY) *
- // filterPixels * numFilters + filtersLoadX;
- targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages +
- blockPixelIdx * numImages + myCaseIdx;
-
- float prod[colorsPerThread][imgsPerThread];
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- prod[c][i] = 0;
- }
- }
-
- const int startY = blockPixelIdxY - paddingStart < filterSize
- ? 0
- : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride;
- const int endY =
- min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride);
- const int startX = blockPixelIdxX - paddingStart < filterSize
- ? 0
- : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride;
- const int endX =
- min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride);
-
- float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX];
- float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread];
- // const bool noFLoop = filterCacheF == filterCacheH;
-
- /*
- * Initial preload
- */
- float hPreload[filterCacheH / B_Y][imgsPerThread]; // [2][4]
- float wPreload[filterCacheF * colorsPerThread / B_X]; // [8]
-
- int moduleIdx, pxIdxInFilter;
- conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(
- startY,
- startX,
- numModulesX,
- paddingStart,
- moduleStride,
- blockPixelIdxY,
- blockPixelIdxX,
- filterSize,
- moduleIdx,
- pxIdxInFilter);
- // const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0]
- // : &filters[moduleIdx * numFilterColors *
- // filterPixels * numFilters + pxIdxInFilter *
- // numFilters + 0];
- int filtersLoadOffset = filtersOffset +
- (conv ? pxIdxInFilter * numFilters + 0
- : moduleIdx * numFilterColors * filterPixels * numFilters +
- pxIdxInFilter * numFilters);
-#pragma unroll
- for (int i = 0; i < colorsPerThread * B_Y; i += B_X * B_Y / filterCacheF) {
- if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 ||
- i + filtersLoadY < colorsPerThread * B_Y) {
- wPreload[i * filterCacheF / (B_X * B_Y)] = tex1Dfetch<float>(
- filters, filtersLoadOffset + i * filterPixels * numFilters);
- }
- }
-
- // const float* hLoad = &hidActs[(moduleIdx + 0 * numModules) * numImages];
- int hidActsLoadOffset =
- hidActsOffset + (moduleIdx + 0 * numModules) * numImages;
-#pragma unroll
- for (int j = 0; j < filterCacheH; j += B_Y) {
- if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
- hPreload[j / B_Y][i] = tex1Dfetch<float>(
- hidActs,
- hidActsLoadOffset + j * numModules * numImages + i * B_X);
- }
- }
- }
- }
-
- for (int my = startY; my < endY; my++) {
- const int moduleTop = paddingStart + my * moduleStride;
- const int pxInFilterY = blockPixelIdxY - moduleTop;
-
- for (int mx = startX; mx < endX; mx++) {
- moduleIdx = my * numModulesX + mx;
- const int moduleLeft = paddingStart + mx * moduleStride;
- const int pxInFilterX = blockPixelIdxX - moduleLeft;
-
- pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX;
- int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext;
- const bool lastModule = my == endY - 1 && mx == endX - 1;
- if (!lastModule) {
- mxNext = mx + 1 == endX ? startX : mx + 1;
- myNext = my + (mx + 1 == endX);
- }
- conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(
- myNext,
- mxNext,
- numModulesX,
- paddingStart,
- moduleStride,
- blockPixelIdxY,
- blockPixelIdxX,
- filterSize,
- moduleIdxNext,
- pxIdxInFilterNext);
- for (int f = 0; f < numFiltersPerGroup;
- f += filterCacheF) { // multiply with filterCacheF filters at a time
-#pragma unroll
- for (int i = 0; i < colorsPerThread * B_Y;
- i += B_X * B_Y / filterCacheF) {
- if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 ||
- i + filtersLoadY < colorsPerThread * B_Y) {
- shFilterLoad[i * filterCacheF] =
- wPreload[i * filterCacheF / (B_X * B_Y)];
- }
- }
-
- filtersLoadOffset = filtersOffset +
- (conv ? pxIdxInFilter * numFilters + f + filterCacheF
- : moduleIdx * numFilterColors * filterPixels * numFilters +
- pxIdxInFilter * numFilters + f + filterCacheF);
- if (f == numFiltersPerGroup - filterCacheF) {
- filtersLoadOffset = filtersOffset +
- (conv ? pxIdxInFilterNext * numFilters
- : moduleIdxNext * numFilterColors * filterPixels *
- numFilters +
- pxIdxInFilterNext * numFilters);
- }
-
-#pragma unroll
- for (int j = 0; j < filterCacheH; j += B_Y) {
- if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- // NOTE: bank conflicts here!
- if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
- shHidActLoad[j * B_X * imgsPerThread + i] =
- hPreload[j / B_Y][i];
- }
- }
- }
- }
-
- __syncthreads();
-
- hidActsLoadOffset = hidActsOffset +
- (moduleIdx + (f + filterCacheH) * numModules) * numImages;
-
-#pragma unroll
- for (int z = 0; z < 4; ++z) {
- IA_PRELOAD_LOOP(z, 0);
- IA_PRELOAD_W_TX(z);
- }
-
-#pragma unroll
- for (int z = 4; z < 12; ++z) {
- IA_PRELOAD_LOOP(z, 0);
- IA_PRELOAD_H_TX((z - 4) / 4, z % 4);
- }
-
-#pragma unroll
- for (int z = 12; z < 16; ++z) {
- IA_PRELOAD_LOOP(z, 0);
- }
-
- __syncthreads();
-
-#pragma unroll
- for (int j = 0; j < filterCacheH; j += B_Y) {
- if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
- shHidActLoad[j * B_X * imgsPerThread + i] =
- hPreload[j / B_Y][i];
- }
- }
- }
- }
-
- __syncthreads();
-
- hidActsLoadOffset = hidActsOffset +
- (moduleIdx + (f + filterCacheF) * numModules) * numImages;
- if (f == numFiltersPerGroup - filterCacheF) {
- hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages;
- }
-
-#pragma unroll
- for (int z = 0; z < 4; ++z) {
- IA_PRELOAD_LOOP(z, filterCacheH);
- IA_PRELOAD_W_TX(z + 4);
- }
-
-#pragma unroll
- for (int z = 4; z < 12; ++z) {
- IA_PRELOAD_LOOP(z, filterCacheH);
- IA_PRELOAD_H_TX((z - 4) / 4, z % 4);
- }
-
-#pragma unroll
- for (int z = 12; z < 16; ++z) {
- IA_PRELOAD_LOOP(z, filterCacheH);
- }
-
- __syncthreads();
- }
- }
- }
- if (scale) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
- targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets *
- targets[c * B_Y * imgPixels * numImages + i * B_X] +
- scaleOutputs * prod[c][i];
- }
- }
- }
- } else {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
- targets[c * B_Y * imgPixels * numImages + i * B_X] =
- scaleOutputs * prod[c][i];
- }
- }
- }
- }
-}
-
-template <
- int B_Y,
- int B_X,
- int imgsPerThread,
- int colorsPerThread,
- int filterCacheF,
- int filterCacheH,
- bool scale,
- bool checkCaseBounds,
- bool conv>
-__global__ void
-//__launch_bounds__(128, 3) // 128 threads per block, 3 blocks per
-// multiprocessor
-conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16(
- cudaTextureObject_t hidActs,
- cudaTextureObject_t filters,
- float* targets,
- const int numModulesY,
- const int numModulesX,
- const int numImages,
- const int numFilters,
- const int filterSize,
- const int imgSizeY,
- const int imgSizeX,
- const int paddingStart,
- const int moduleStride,
- const int numImgColors,
- const int numGroups,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shFilters[colorsPerThread * B_Y][filterCacheF];
- __shared__ float shHidActs[filterCacheH][B_X * imgsPerThread];
-
- const int numImgBlocks = DIVUP(numImages, B_X * imgsPerThread);
- const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
- const int myCaseIdx = blockCaseIdx + threadIdx.x;
-
- const int imgColorIdx =
- (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally
- const int numFilterColors = numImgColors / numGroups;
- const int blockGroupIdx = imgColorIdx / numFilterColors;
- const int filterColorIdx =
- imgColorIdx % numFilterColors; // color idx within group
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
-
- const int blockPixelIdx = blockIdx.y;
- const int blockPixelIdxX = blockPixelIdx % imgSizeX;
- const int blockPixelIdxY = blockPixelIdx / imgSizeX;
-
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
- const int tidx = threadIdx.y * B_X + threadIdx.x;
- // const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x %
- // B_X;
- // const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx %
- // (B_X*imgsPerThread);
- const int filtersLoadY = tidx / filterCacheF,
- filtersLoadX = tidx % filterCacheF;
- // nvcc is behaving idiotically again, these useless declarations save
- // registers
- // const int outputY = threadIdx.y, outputX = threadIdx.x;
- // const int ty = threadIdx.y, tx = threadIdx.x;
- const int numModules = numModulesY * numModulesX;
-
- const int hidActsOffset =
- (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx;
- const int filtersOffset = blockFilterIdx +
- (filterColorIdx + filtersLoadY) * filterPixels * numFilters +
- filtersLoadX;
-
- // hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules +
- // myCaseIdx; filters += blockFilterIdx + (filterColorIdx + filtersLoadY) *
- // filterPixels * numFilters + filtersLoadX;
- targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages +
- blockPixelIdx * numImages + myCaseIdx;
-
- float prod[colorsPerThread][imgsPerThread];
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- prod[c][i] = 0;
- }
- }
-
- const int startY = blockPixelIdxY - paddingStart < filterSize
- ? 0
- : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride;
- const int endY =
- min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride);
- const int startX = blockPixelIdxX - paddingStart < filterSize
- ? 0
- : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride;
- const int endX =
- min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride);
-
- float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX];
- float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread];
- // const bool noFLoop = filterCacheF == filterCacheH;
-
- /*
- * Initial preload
- */
- float hPreload[filterCacheH / B_Y][imgsPerThread]; // [4][4]
- float wPreload[filterCacheF * colorsPerThread / B_X]; // [6]
-
- int moduleIdx, pxIdxInFilter;
- conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(
- startY,
- startX,
- numModulesX,
- paddingStart,
- moduleStride,
- blockPixelIdxY,
- blockPixelIdxX,
- filterSize,
- moduleIdx,
- pxIdxInFilter);
- // const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0]
- // : &filters[moduleIdx * numFilterColors *
- // filterPixels * numFilters + pxIdxInFilter *
- // numFilters + 0];
- int filtersLoadOffset = filtersOffset +
- (conv ? pxIdxInFilter * numFilters
- : moduleIdx * numFilterColors * filterPixels * numFilters +
- pxIdxInFilter * numFilters);
-#pragma unroll
- for (int i = 0; i < colorsPerThread * B_Y; i += B_X * B_Y / filterCacheF) {
- if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 ||
- i + filtersLoadY < colorsPerThread * B_Y) {
- wPreload[i * filterCacheF / (B_X * B_Y)] = tex1Dfetch<float>(
- filters, filtersLoadOffset + i * filterPixels * numFilters);
- }
- }
-
- // const float* hLoad = &hidActs[moduleIdx * numImages];
- int hidActsLoadOffset = hidActsOffset + moduleIdx * numImages;
-#pragma unroll
- for (int j = 0; j < filterCacheH; j += B_Y) {
- if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
- hPreload[j / B_Y][i] = tex1Dfetch<float>(
- hidActs,
- hidActsLoadOffset + j * numModules * numImages + i * B_X);
- }
- }
- }
- }
-
- for (int my = startY; my < endY; my++) {
- const int moduleTop = paddingStart + my * moduleStride;
- const int pxInFilterY = blockPixelIdxY - moduleTop;
-
- for (int mx = startX; mx < endX; mx++) {
- moduleIdx = my * numModulesX + mx;
- const int moduleLeft = paddingStart + mx * moduleStride;
- const int pxInFilterX = blockPixelIdxX - moduleLeft;
-
- pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX;
- int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext;
- const bool lastModule = my == endY - 1 && mx == endX - 1;
- if (!lastModule) {
- mxNext = mx + 1 == endX ? startX : mx + 1;
- myNext = my + (mx + 1 == endX);
- }
- conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(
- myNext,
- mxNext,
- numModulesX,
- paddingStart,
- moduleStride,
- blockPixelIdxY,
- blockPixelIdxX,
- filterSize,
- moduleIdxNext,
- pxIdxInFilterNext);
- for (int f = 0; f < numFiltersPerGroup;
- f += filterCacheF) { // multiply with filterCacheF filters at a time
-#pragma unroll
- for (int i = 0; i < colorsPerThread * B_Y;
- i += B_X * B_Y / filterCacheF) {
- if ((colorsPerThread * B_Y) % (B_X * B_Y / filterCacheF) == 0 ||
- i + filtersLoadY < colorsPerThread * B_Y) {
- shFilterLoad[i * filterCacheF] =
- wPreload[i * filterCacheF / (B_X * B_Y)];
- }
- }
-
- filtersLoadOffset = filtersOffset +
- (conv ? pxIdxInFilter * numFilters + f + filterCacheF
- : moduleIdx * numFilterColors * filterPixels * numFilters +
- pxIdxInFilter * numFilters + f + filterCacheF);
- if (f == numFiltersPerGroup - filterCacheF) {
- filtersLoadOffset = filtersOffset +
- (conv ? pxIdxInFilterNext * numFilters
- : moduleIdxNext * numFilterColors * filterPixels *
- numFilters +
- pxIdxInFilterNext * numFilters);
- }
-
-#pragma unroll
- for (int j = 0; j < filterCacheH; j += B_Y) {
- if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- // NOTE: bank conflicts here!
- if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
- shHidActLoad[j * B_X * imgsPerThread + i] =
- hPreload[j / B_Y][i];
- }
- }
- }
- }
- hidActsLoadOffset = hidActsOffset +
- (moduleIdx + (f + filterCacheF) * numModules) * numImages;
- if (f == numFiltersPerGroup - filterCacheF) {
- hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages;
- }
-
- __syncthreads();
-
- // It seems that there is no point explicitly interleaving loads
- // and computations because the scheduler does that anyway.
-
- IA_PRELOAD_LOOP2(0, 0);
- IA_PRELOAD_LOOP2(1, 0);
- IA_PRELOAD_LOOP2(2, 0);
- IA_PRELOAD_LOOP2(3, 0);
- IA_PRELOAD_LOOP2(4, 0);
- IA_PRELOAD_LOOP2(5, 0);
- IA_PRELOAD_LOOP2(6, 0);
- IA_PRELOAD_LOOP2(7, 0);
- IA_PRELOAD_LOOP2(8, 0);
- IA_PRELOAD_LOOP2(9, 0);
- IA_PRELOAD_LOOP2(10, 0);
- IA_PRELOAD_LOOP2(11, 0);
- IA_PRELOAD_LOOP2(12, 0);
- IA_PRELOAD_LOOP2(13, 0);
- IA_PRELOAD_LOOP2(14, 0);
- IA_PRELOAD_LOOP2(15, 0);
-
- IA_PRELOAD_W_TX(0);
- IA_PRELOAD_W_TX(1);
- IA_PRELOAD_W_TX(2);
- IA_PRELOAD_W_TX(3);
- IA_PRELOAD_W_TX(4);
- IA_PRELOAD_W_TX(5);
-
- IA_PRELOAD_H_TX(0, 0);
- IA_PRELOAD_H_TX(0, 1);
- IA_PRELOAD_H_TX(0, 2);
- IA_PRELOAD_H_TX(0, 3);
- IA_PRELOAD_H_TX(1, 0);
- IA_PRELOAD_H_TX(1, 1);
- IA_PRELOAD_H_TX(1, 2);
- IA_PRELOAD_H_TX(1, 3);
- IA_PRELOAD_H_TX(2, 0);
- IA_PRELOAD_H_TX(2, 1);
- IA_PRELOAD_H_TX(2, 2);
- IA_PRELOAD_H_TX(2, 3);
- IA_PRELOAD_H_TX(3, 0);
- IA_PRELOAD_H_TX(3, 1);
- IA_PRELOAD_H_TX(3, 2);
- IA_PRELOAD_H_TX(3, 3);
-
- __syncthreads();
- }
- }
- }
- if (scale) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
- targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets *
- targets[c * B_Y * imgPixels * numImages + i * B_X] +
- scaleOutputs * prod[c][i];
- }
- }
- }
- } else {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int i = 0; i < imgsPerThread; i++) {
- if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
- targets[c * B_Y * imgPixels * numImages + i * B_X] =
- scaleOutputs * prod[c][i];
- }
- }
- }
- }
-}
-
-/*
- * hidActs: (numFilters, numModules, numImages)
- * filters: (numFilterColors, filterPixels, numFilters) if
- * conv (numModules, numFilterColors, filterPixels, numFilters) otherwise
- * targets: (overSample, numImgColors, imgPixels, numImages)
- *
- * Note: all of these convolution routines are optimized for the case when
- * the number of images (i.e. the minibatch size) is a multiple of 128.
- * Other batch sizes will work, but but I made no attempt whatsoever
- * to make them work fast.
- */
-void _imgActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int imgSizeX,
- int numModulesY,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput,
- bool conv) {
- CAFFE_ENFORCE(hidActs->ndim() == 2);
- CAFFE_ENFORCE(filters->ndim() == 2);
- CAFFE_ENFORCE(targets->ndim() == 2);
-
- int numFilterColors = numImgColors / numGroups;
- int numImages = hidActs->dim32(1);
- int numFilters = filters->dim32(1);
- int numModules = hidActs->dim32(0) / numFilters;
- int filterModuleMult = conv ? 1 : numModules;
- int filterPixels = filters->dim32(0) / (filterModuleMult * numFilterColors);
- int filterSize = sqrt(filterPixels);
- int imgPixels = imgSizeY * imgSizeX;
- int numModulesX = numModules / numModulesY;
-
- CAFFE_ENFORCE(numImgColors % numGroups == 0);
- CAFFE_ENFORCE(
- numFilters % (16 * numGroups) ==
- 0); // TODO: insisting on 32 filters due to bug in calling code below. fix
- // that.
- CAFFE_ENFORCE(
- numGroups > 1 ||
- (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 2 == 0)));
- CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 4 == 0);
-
- CAFFE_ENFORCE(filterPixels == filterSize * filterSize);
- CAFFE_ENFORCE(hidActs->dim32(0) == numModules * numFilters);
- CAFFE_ENFORCE(
- filters->dim32(0) == filterModuleMult * numFilterColors * filterPixels);
- CAFFE_ENFORCE(numModules == numModulesY * numModulesX);
-
- // These routines don't handle the case when only part of the image is visited
- // in the convolution
- CAFFE_ENFORCE(paddingStart <= 0);
- CAFFE_ENFORCE(
- paddingStart + (numModulesX - 1) * moduleStride + filterSize >= imgSizeX);
- CAFFE_ENFORCE(
- paddingStart + (numModulesY - 1) * moduleStride + filterSize >= imgSizeY);
- CAFFE_ENFORCE(moduleStride <= filterSize);
-
- dim3 blocks;
- dim3 threads;
- int colorsPerThread, imgsPerThread;
- if (numFilterColors % 8 == 0) {
- threads = dim3(32, numFilterColors % 64 == 0 ? 8 : 4);
- colorsPerThread = numFilterColors % 64 == 0 ? 8
- : numFilterColors % 48 == 0
- ? 12
- : numFilterColors % 32 == 0 ? 8 : numFilterColors % 16 == 0 ? 4 : 2;
- imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
- CAFFE_ENFORCE(numFilterColors % (threads.y * colorsPerThread) == 0);
-
- blocks = dim3(
- DIVUP(numImages, threads.x * imgsPerThread) *
- (numImgColors / (threads.y * colorsPerThread)),
- imgPixels);
- // NOTE: the case when channels % 32 == 0 but channels % 48 != 0 and
- // channels % 64 != 0 has not been optimized!!
- } else if (numFilterColors > 3) {
- // NOTE: THIS CASE HAS NOT BEEN OPTIMIZED FOR KEPLER!!
- imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2;
- threads = dim3(16, 16);
- colorsPerThread = numFilterColors % 4 == 0 ? 4 : 2;
- blocks = dim3(
- DIVUP(numImages, threads.x * imgsPerThread) *
- (numImgColors / colorsPerThread),
- DIVUP(imgSizeY, 4) * DIVUP(imgSizeX, 4));
- } else {
- // NOTE: THIS CASE HAS NOT BEEN OPTIMIZED FOR KEPLER!!
- imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2;
- threads = dim3(16, 16);
- blocks = dim3(
- DIVUP(numImages, threads.x * imgsPerThread),
- DIVUP(imgSizeY, 4) * DIVUP(imgSizeX, 4));
- }
- bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0;
-
- if (scaleTargets == 0) { // do not scale or use targets matrix
- targets->Resize(std::vector<int>{numImgColors * imgPixels, numImages});
- } else {
- CAFFE_ENFORCE(targets->dim32(0) == numImgColors * imgPixels);
- CAFFE_ENFORCE(targets->dim32(1) == numImages);
- }
- const bool scale = scaleTargets != 0;
-
- float* hidacts_data = hidActs->mutable_data<float>();
- float* filters_data = filters->mutable_data<float>();
- float* targets_data = targets->mutable_data<float>();
-
- cudaStream_t stream = context->cuda_stream();
- // cudaFuncSetCacheConfig(conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16<
- // 4, 32, 4, 12, 16, 16, false, false, true >, cudaFuncCachePreferShared);
- // conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16< 4, 32, 4,
- // 12, 16, 16, false, false, true ><<<blocks, threads, 0, stream>>>(
- // tex_hidacts, tex_filters, targets_data, numModulesY,
- // numModulesX, numImages, numFilters, filterSize, imgSizeY,
- // imgSizeX, paddingStart, moduleStride, numImgColors, numGroups,
- // scaleTargets, scaleOutput);
-
- // return;
- // printf("conv: %d\n", conv);
- // printf("scale: %d\n", scale);
- // printf("checkCaseBounds: %d\n", checkCaseBounds);
- // printf("numFilterColors: %d\n", numFilterColors);
- // printf("numImages: %d\n", numImages);
- // cudaStream_t stream = NVMatrix::getDefaultStream();
-
- if (conv == true) {
- if (scale == false) {
- if (checkCaseBounds == false) {
- if (numFilterColors % 8 == 0) {
- if (numFilterColors % 64 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 128 == 0) {
- cudaTextureObject_t tex_hidacts =
- GetTensorTextureObject(hidActs);
- cudaTextureObject_t tex_filters =
- GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex<
- 8,
- 32,
- 4,
- 8,
- 32,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex<
- 8,
- 32,
- 4,
- 8,
- 32,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- tex_hidacts,
- tex_filters,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 32,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 32,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 4,
- 8,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 4,
- 8,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaTextureObject_t tex_hidacts =
- GetTensorTextureObject(hidActs);
- cudaTextureObject_t tex_filters =
- GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16<
- 4,
- 32,
- 4,
- 12,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16<
- 4,
- 32,
- 4,
- 12,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- tex_hidacts,
- tex_filters,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 12,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 12,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 32,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 32,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 32,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 32,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 4,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 4,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 4,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 4,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 8 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 2,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 2,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 2,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 2,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors > 3) {
- if (numFilterColors == 4) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<8, 4, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<8, 4, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<4, 4, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<4, 4, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 2, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 2, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 2, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 2, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 3, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 3, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 3, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 3, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 2, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 2, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 2, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 2, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 1) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 1, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 1, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 1, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 1, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, false, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, false, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- }
- } else if (checkCaseBounds == true) {
- if (numFilterColors % 8 == 0) {
- if (numFilterColors % 64 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 8 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors > 3) {
- if (numFilterColors == 4) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, false, true, true>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, false, true, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, true, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, true, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, false, true, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, false, true, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, true, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, true, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 1) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, false, true, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, false, true, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- }
- }
- } else if (scale == true) {
- if (checkCaseBounds == false) {
- if (numFilterColors % 8 == 0) {
- if (numFilterColors % 64 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 128 == 0) {
- cudaTextureObject_t tex_hidacts =
- GetTensorTextureObject(hidActs);
- cudaTextureObject_t tex_filters =
- GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex<
- 8,
- 32,
- 4,
- 8,
- 32,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex<
- 8,
- 32,
- 4,
- 8,
- 32,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- tex_hidacts,
- tex_filters,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 32,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 32,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 4,
- 8,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 4,
- 8,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaTextureObject_t tex_hidacts =
- GetTensorTextureObject(hidActs);
- cudaTextureObject_t tex_filters =
- GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16<
- 4,
- 32,
- 4,
- 12,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16<
- 4,
- 32,
- 4,
- 12,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- tex_hidacts,
- tex_filters,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 12,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 12,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 32,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 32,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 32,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 32,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 4,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 4,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 4,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 4,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 8 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 2,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 2,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 2,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 2,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- false,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors > 3) {
- if (numFilterColors == 4) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<8, 4, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<8, 4, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<4, 4, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<4, 4, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 2, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 2, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 2, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 2, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 3, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 3, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 3, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 3, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 2, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 2, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 2, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 2, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 1) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 1, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 1, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 1, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 1, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, true, false, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, true, false, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- }
- } else if (checkCaseBounds == true) {
- if (numFilterColors % 8 == 0) {
- if (numFilterColors % 64 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 8 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- true,
- true><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors > 3) {
- if (numFilterColors == 4) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, true, true, true>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, true, true, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, true, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, true, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, true, true, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, true, true, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, true, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, true, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 1) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, true, true, true>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, true, true, true>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- }
- }
- }
- } else if (conv == false) {
- if (scale == false) {
- if (checkCaseBounds == false) {
- if (numFilterColors % 8 == 0) {
- if (numFilterColors % 64 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 128 == 0) {
- cudaTextureObject_t tex_hidacts =
- GetTensorTextureObject(hidActs);
- cudaTextureObject_t tex_filters =
- GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex<
- 8,
- 32,
- 4,
- 8,
- 32,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex<
- 8,
- 32,
- 4,
- 8,
- 32,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- tex_hidacts,
- tex_filters,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 32,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 32,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 4,
- 8,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 4,
- 8,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaTextureObject_t tex_hidacts =
- GetTensorTextureObject(hidActs);
- cudaTextureObject_t tex_filters =
- GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16<
- 4,
- 32,
- 4,
- 12,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16<
- 4,
- 32,
- 4,
- 12,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- tex_hidacts,
- tex_filters,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 12,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 12,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 32,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 32,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 32,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 32,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 4,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 4,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 4,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 4,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 8 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 2,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 2,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 2,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 2,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors > 3) {
- if (numFilterColors == 4) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<8, 4, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<8, 4, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<4, 4, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<4, 4, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 2, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 2, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 2, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 2, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 3, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 3, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 3, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 3, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 2, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 2, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 2, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 2, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 1) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 1, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 1, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 1, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 1, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, false, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, false, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- }
- } else if (checkCaseBounds == true) {
- if (numFilterColors % 8 == 0) {
- if (numFilterColors % 64 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- false,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- false,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- false,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- false,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 8 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- false,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors > 3) {
- if (numFilterColors == 4) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, false, true, false>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, false, true, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, true, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, true, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, false, true, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, false, true, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, false, true, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, false, true, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 1) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, false, true, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, false, true, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- }
- }
- } else if (scale == true) {
- if (checkCaseBounds == false) {
- if (numFilterColors % 8 == 0) {
- if (numFilterColors % 64 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 128 == 0) {
- cudaTextureObject_t tex_hidacts =
- GetTensorTextureObject(hidActs);
- cudaTextureObject_t tex_filters =
- GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex<
- 8,
- 32,
- 4,
- 8,
- 32,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex<
- 8,
- 32,
- 4,
- 8,
- 32,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- tex_hidacts,
- tex_filters,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 32,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 32,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 4,
- 8,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 4,
- 8,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 2,
- 8,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaTextureObject_t tex_hidacts =
- GetTensorTextureObject(hidActs);
- cudaTextureObject_t tex_filters =
- GetTensorTextureObject(filters);
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16<
- 4,
- 32,
- 4,
- 12,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_preloadfh_ty_4_tx_32_c_12_ff_16_fh_16<
- 4,
- 32,
- 4,
- 12,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- tex_hidacts,
- tex_filters,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- checkCudaErrors(cudaDestroyTextureObject(tex_filters));
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 12,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 12,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 32,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 32,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 32,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 32,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 8,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 8,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 4,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 4,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 4,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 4,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 8 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 2,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 4,
- 2,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 2,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 2,
- 2,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- false,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors > 3) {
- if (numFilterColors == 4) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<8, 4, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<8, 4, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<4, 4, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<4, 4, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 2, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 2, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 2, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 2, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 3, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 3, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 3, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 3, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 2, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 2, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 2, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 2, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 1) {
- if (numFilters % 16 == 0) {
- if (numImages % 128 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<8, 1, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<8, 1, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 64 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<4, 1, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<4, 1, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 32 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- } else if (numImages % 16 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, true, false, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, true, false, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- }
- } else if (checkCaseBounds == true) {
- if (numFilterColors % 8 == 0) {
- if (numFilterColors % 64 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 8,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 12,
- 16,
- 16,
- true,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFilters % 32 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 32,
- 16,
- true,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 8,
- 16,
- 16,
- true,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 4,
- 16,
- 16,
- true,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors % 8 == 0) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_img_acts_manycolor_kepler<
- 4,
- 32,
- 1,
- 2,
- 16,
- 16,
- true,
- true,
- false><<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors > 3) {
- if (numFilterColors == 4) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_mediumcolor<2, 4, true, true, false>,
- cudaFuncCachePreferShared);
- img_acts_mediumcolor<2, 4, true, true, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, true, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, true, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 3, true, true, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 3, true, true, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 2) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 2, true, true, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 2, true, true, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors == 1) {
- if (numFilters % 16 == 0) {
- if (numImages % 1 == 0) {
- cudaFuncSetCacheConfig(
- img_acts_color<2, 1, true, true, false>,
- cudaFuncCachePreferShared);
- img_acts_color<2, 1, true, true, false>
- <<<blocks, threads, 0, stream>>>(
- hidacts_data,
- filters_data,
- targets_data,
- numModulesY,
- numModulesX,
- numImages,
- numFilters,
- filterSize,
- imgSizeY,
- imgSizeX,
- paddingStart,
- moduleStride,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- }
- }
- }
- }
-
- getLastCudaError("imgActs: kernel execution failed");
-}
-
-void convImgActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int imgSizeX,
- int numModulesY,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups) {
- _imgActs(
- context,
- hidActs,
- filters,
- targets,
- imgSizeY,
- imgSizeX,
- numModulesY,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- 0,
- 1,
- true);
-}
-
-void convImgActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int imgSizeX,
- int numModulesY,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput) {
- _imgActs(
- context,
- hidActs,
- filters,
- targets,
- imgSizeY,
- imgSizeX,
- numModulesY,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- true);
-}
-
-void localImgActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int imgSizeX,
- int numModulesY,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups) {
- _imgActs(
- context,
- hidActs,
- filters,
- targets,
- imgSizeY,
- imgSizeX,
- numModulesY,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- 0,
- 1,
- false);
-}
-
-void localImgActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* filters,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int imgSizeX,
- int numModulesY,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput) {
- _imgActs(
- context,
- hidActs,
- filters,
- targets,
- imgSizeY,
- imgSizeX,
- numModulesY,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- scaleTargets,
- scaleOutput,
- false);
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vector>
-
-#include "../include/cudaconv2.cuh"
-
-#define LO16(x) ((x)&0x0000FFFF)
-#define HI16(x) ((x) >> 16)
-
-#define WA_LOOP(r) \
- _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) { \
- _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \
- prod[f][c] += shImages[threadIdx.y + c * B_Y][(r)] * \
- shHidActs[threadIdx.x + f * B_X][(r)]; \
- } \
- }
-
-#define WA_LOOP2(r) \
- _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \
- _Pragma("unroll") for (int c = 0; c < colorsPerThread; c++) { \
- prod[f][c] += shImages[threadIdx.y + c * B_Y][(r)] * \
- shHidActs[threadIdx.x + f * B_X][(r)]; \
- } \
- }
-
-#define WA_IMLOAD(r) \
- imPreload[r] = im[(r)*B_X * B_Y / preloadCases * imgPixels * imgStride];
-#define WA_IMLOAD_TX(r) \
- imPreload[r] = tex1Dfetch<float>( \
- images, \
- imgOffset2 + (r)*B_X * B_Y / preloadCases * imgPixels * imgStride);
-#define WA_HALOAD(r) \
- haPreload[r] = ha[(r)*B_X * B_Y / preloadCases * numImages * numModules];
-#define WA_HALOAD_TX(r) \
- haPreload[r] = tex1Dfetch<float>( \
- hidActs, \
- hidActsOffset2 + (r)*B_X * B_Y / preloadCases * numImages * numModules);
-
-__device__ __forceinline__ void
-conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords(
- const int my,
- const int mx,
- const int paddingStart,
- const int numModulesX,
- const int moduleStride,
- const int blockPixelY,
- const int blockPixelX,
- const int imgSizeX,
- const int imgStride,
- int& pixIdx,
- int& m) {
- const int imgLoadModPosY = paddingStart + my * moduleStride;
- const int imgLoadModPosX = paddingStart + mx * moduleStride;
- const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image
- const int pxX = imgLoadModPosX + blockPixelX;
- pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image
- m = my * numModulesX + mx;
-}
-
-/*
- * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X
- * filters threadIdx.x determines filter threadIdx.y determines pixel in filter
- *
- * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of
- * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread
- *
- * Number of filters must be divisible by B_X * filtersPerThread
- * Number of images (cases) should be divisible by preloadCases if
- * checkCaseBounds is false.
- *
- * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- *
- * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels,
- * numFilters)
- *
- * B_Y * B_X should be divisible by preloadCases.
- * preloadCases one of 16, 32.
- * B_X one of 4, 8, 16, 32
- * B_Y arbitrary (satisfying divisibility constraints)
- * numModules must be divisible by partialSum
- * pixelsPerThread must be divisible by pixelCache
- *
- * After adding pixelsPerThread, register usage went from 20 to 23 (when
- * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's
- * unable to optimize that case away.
- */
-template <
- int B_Y,
- int B_X,
- int pixelCache,
- int pixelsPerThread,
- int filtersPerThread,
- int preloadCases,
- int numColors,
- bool scale,
- bool checkCaseBounds>
-__global__ void conv_weight_acts_c_kepler(
- float* images,
- float* hidActs,
- float* targets,
- const int numImages,
- const int numFilters,
- const int numModulesY,
- const int numModulesX,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int imgStride,
- const int partialSum,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shImages[pixelCache * B_Y * numColors]
- [preloadCases]; // preload preloadCases cases of B_Y
- // * pixelsPerThread pixels
- __shared__ float
- shHidActs[B_X * filtersPerThread]
- [preloadCases + 1]; // preload preloadCases cases of B_X hidActs
-
- const int tidx = B_X * threadIdx.y + threadIdx.x;
- const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
-
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
-
- const int filterBlocksPerModule = numFilters / (B_X * filtersPerThread);
- const int outputModuleIdx = blockIdx.x / filterBlocksPerModule;
- const int moduleIdx = partialSum * outputModuleIdx;
- const int blockFilterIdx =
- B_X * filtersPerThread * (blockIdx.x % filterBlocksPerModule);
-
- // const int moduleStride = (imgSize - filterSize + 1) / numModulesX;
- const int numModules = numModulesY * numModulesX;
-
- const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread;
-
- images += loadX;
- hidActs += blockFilterIdx * numImages * numModules +
- loadY * numImages * numModules + loadX;
-
- targets += (outputModuleIdx * numFilters) * filterPixels * numColors +
- blockPixelOffset * numFilters + blockFilterIdx +
- threadIdx.y * numFilters + threadIdx.x;
-
- float prod[numColors][pixelsPerThread][filtersPerThread];
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[c][p][f] = 0;
- }
- }
- }
-
- __shared__ int pxIdxes[B_Y * pixelsPerThread];
- //__shared__ bool isPxInImage[B_Y*pixelsPerThread];
- for (int m = moduleIdx; m < moduleIdx + partialSum; m++) {
- __syncthreads();
- if (tidx < B_Y * pixelsPerThread) {
- const int imgLoadModPosY =
- paddingStart + (m / numModulesX) * moduleStride;
- const int imgLoadModPosX =
- paddingStart + (m % numModulesX) * moduleStride;
- int pxY = (imgLoadModPosY + (blockPixelOffset + tidx) / filterSize);
- int pxX = (imgLoadModPosX + (blockPixelOffset + tidx) % filterSize);
- int pixIdx = (pxY * imgSizeX + pxX) * imgStride;
- pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX
- ? pixIdx
- : -1;
- // isPxInImage[tidx] = ;
- }
- __syncthreads();
- for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
- if (/*loadY < B_X*filtersPerThread &&*/ (
- !checkCaseBounds || caseIdx + loadX < numImages)) {
-#pragma unroll
- for (int y = 0; y < B_X * filtersPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_X * filtersPerThread) {
- shHidActs[loadY + y][loadX] =
- hidActs[caseIdx + y * numImages * numModules + m * numImages];
- }
- }
- }
-#pragma unroll
- for (int pp = 0; pp < pixelsPerThread; pp += pixelCache) {
-// if (loadY < B_Y * pixelCache) { // This condition is not necessary for
-// correctness, but it speeds things a bit
-/*
- * As long as B_Y * B_X is divisible by preloadCases this will loop the right
- * number of times.
- *
- * This will load some imgGrads from filter pixels that don't exit (it'll set
- * those to 0), but the code does not produce any output for those pixels (see
- * last lines).
- */
-#pragma unroll
- for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_Y * pixelCache) {
- const int pxIdx = pp * B_Y + loadY + y; // pixel idx in filter
-
- if (pxIdx + blockPixelOffset < filterPixels &&
- (!checkCaseBounds || caseIdx + loadX < numImages)) {
- const int pixIdx =
- pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride;
-
- if (pixIdx >= 0) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] =
- images[caseIdx + c * imgPixels * imgStride + pixIdx];
- }
- } else {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0;
- }
- }
- } else {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0;
- }
- }
- }
- }
- //}
-
- __syncthreads();
-
-#pragma unroll
- for (int i = 0; i < preloadCases; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int p = 0; p < pixelCache; p++) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- prod[c][pp + p][f] +=
- shImages[threadIdx.y + p * B_Y + c * pixelCache * B_Y][i] *
- shHidActs[threadIdx.x + f * B_X][i];
- }
- }
- }
- }
-
- __syncthreads();
- }
- }
- }
-
- if (scale) {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
- if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets
- [p * B_Y * numFilters + c * filterPixels * numFilters +
- f * B_X] = scaleTargets *
- targets[p * B_Y * numFilters +
- c * filterPixels * numFilters + f * B_X] +
- scaleOutputs * prod[c][p][f];
- }
- }
- }
- }
- } else {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
- if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets
- [p * B_Y * numFilters + c * filterPixels * numFilters +
- f * B_X] = scaleOutputs * prod[c][p][f];
- }
- }
- }
- }
- }
-}
-
-/*
- * Each block computes weight gradients for 1 pixel, B_Y * colorsPerThread
- colors and B_X * filtersPerThread filters
- * threadIdx.x determines filter
- * threadIdx.y determines color
- *
- * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of
- partialSum
- * blockIdx.y determines color batch of B_Y * colorsPerThread
- * blockIdx.z determines pixel in filter
- * NOTE: blockIdx.z is limited to values < 2^16. This means that this
- routine will
- * fail for filters >= 256*256. I'm assuming I won't ever use
- such large filters.
-
- * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- *
- * targets: (numModulesY*numModulesX/partialSum, numFilterColors,
- filterPixels, numFilters)
-
- * B_X * B_Y must be divisible by preloadCases
- */
-template <
- int B_Y,
- int B_X,
- int filtersPerThread,
- int colorsPerThread,
- int preloadCases,
- bool scale>
-__global__ void conv_weight_acts_mc_mf_kepler(
- float* images,
- float* hidActs,
- float* targets,
- const int numImages,
- const int numFilters,
- const int numModulesY,
- const int numModulesX,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int imgStride,
- const int numImgColors,
- const int numGroups,
- const int partialSum,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shImages[colorsPerThread * B_Y]
- [preloadCases]; // preload preloadCases cases
- __shared__ float
- shHidActs[filtersPerThread * B_X]
- [preloadCases + 1]; // preload preloadCases cases of B_X hidacts
-
- const int tidx = B_X * threadIdx.y + threadIdx.x;
- const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
-
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
-
- const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
- const int outputModuleIdx = blockIdx.x / numFilterBlocks;
- const int moduleIdx = partialSum * outputModuleIdx;
- const int blockFilterIdx =
- filtersPerThread * B_X * (blockIdx.x % numFilterBlocks);
- const int numModules = numModulesY * numModulesX;
-
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
- const int numFilterColors = numImgColors / numGroups;
-
- const int blockPixelOffset = blockIdx.z; // pixel idx in filter
- const int blockPixelY = blockPixelOffset / filterSize,
- blockPixelX = blockPixelOffset % filterSize;
- const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread;
- const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors;
-
- images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX;
-
- hidActs += blockFilterIdx * numImages * numModules +
- loadY * numImages * numModules + loadX;
-
- targets += outputModuleIdx * numFilters * filterPixels * numFilterColors +
- (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters +
- blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x;
- // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return;
- float* shHidActLoad = &shHidActs[loadY][loadX];
- float* shImgLoad = &shImages[loadY][loadX];
- float prod[colorsPerThread][filtersPerThread];
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[c][f] = 0;
- }
- }
-
- for (int m = moduleIdx; m < moduleIdx + partialSum; m++) {
- const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride;
- const int imgLoadModPosX = paddingStart + (m % numModulesX) * moduleStride;
- const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image
- const int pxX = imgLoadModPosX + blockPixelX;
- const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image
- if (pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX) {
- for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
- // Checking this condition actually makes things faster ... :/
- // So I've removed the !checkCaseBounds flag and just check it all the
- // time.
- if (caseIdx + loadX < numImages) {
- /*
- * As long as B_Y * B_X is divisible by preloadCases this will loop
- * the right number of times.
- *
- * This will load some images from filter pixels that don't exist
- * (it'll set those to 0), but the code does not produce any output
- * for those pixels (see last lines).
- */
- if (loadY < B_Y * colorsPerThread) {
-#pragma unroll
- for (int y = 0; y < B_Y * colorsPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_Y * colorsPerThread) {
- shImgLoad[(y)*preloadCases] =
- images[caseIdx + y * imgPixels * imgStride + pixIdx];
- }
- }
- }
-
- if (loadY < B_X * filtersPerThread) {
-#pragma unroll
- for (int y = 0; y < B_X * filtersPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_X * filtersPerThread) {
- shHidActLoad[y * (preloadCases + 1)] = hidActs
- [caseIdx + y * numImages * numModules + m * numImages];
- }
- }
- }
- } else {
-#pragma unroll
- for (int y = 0; y < B_Y * colorsPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_Y * colorsPerThread) {
- shImgLoad[(y)*preloadCases] = 0;
- }
- }
-#pragma unroll
- for (int y = 0; y < B_X * filtersPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_X * filtersPerThread) {
- shHidActLoad[y * (preloadCases + 1)] = 0;
- }
- }
- }
-
- __syncthreads();
-#pragma unroll
- for (int i = 0; i < preloadCases; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- prod[c][f] += shImages[threadIdx.y + c * B_Y][i] *
- shHidActs[threadIdx.x + f * B_X][i];
- }
- }
- }
- __syncthreads();
- }
- }
- }
- if (scale) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets *
- targets[c * B_Y * filterPixels * numFilters + f * B_X] +
- scaleOutputs * prod[c][f];
- }
- }
- } else {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets[c * B_Y * filterPixels * numFilters + f * B_X] =
- scaleOutputs * prod[c][f];
- }
- }
- }
-}
-
-/*
- * Each block computes weight gradients for 1 pixel, B_Y * colorsPerThread
- colors and B_X * filtersPerThread filters
- * threadIdx.x determines filter
- * threadIdx.y determines color
- *
- * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of
- partialSum
- * blockIdx.y determines color batch of B_Y * colorsPerThread
- * blockIdx.z determines pixel in filter
- * NOTE: blockIdx.z is limited to values < 2^16. This means that this
- routine will
- * fail for filters >= 256*256. I'm assuming I won't ever use
- such large filters.
-
- * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- *
- * targets: (numModulesY*numModulesX/partialSum, numFilterColors,
- filterPixels, numFilters)
-
- * B_X * B_Y must be divisible by preloadCases
- */
-template <
- int B_Y,
- int B_X,
- int filtersPerThread,
- int colorsPerThread,
- int preloadCases,
- bool scale>
-__global__ void conv_weight_acts_mc_mf_kepler_sw(
- float* images,
- float* hidActs,
- float* targets,
- const int numImages,
- const int numFilters,
- const int numModulesY,
- const int numModulesX,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int imgStride,
- const int numImgColors,
- const int numGroups,
- const int sumWidth,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shImages[colorsPerThread * B_Y]
- [preloadCases]; // preload preloadCases cases
- __shared__ float
- shHidActs[filtersPerThread * B_X]
- [preloadCases + 1]; // preload preloadCases cases of B_X hidacts
-
- const int tidx = B_X * threadIdx.y + threadIdx.x;
- const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
-
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
-
- const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
- const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks;
-
- const int numModuleChunksX = DIVUP(numModulesX, sumWidth);
- // const int numModuleChunksY = DIVUP(numModulesY, sumWidth);
-
- const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX;
- const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX;
-
- const int blockModuleStartX = blockModuleChunkX * sumWidth;
- const int blockModuleStartY = blockModuleChunkY * sumWidth;
-
- const int blockFilterIdx =
- filtersPerThread * B_X * (blockIdx.x % numFilterBlocks);
- const int numModules = numModulesY * numModulesX;
-
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
- const int numFilterColors = numImgColors / numGroups;
-
- const int blockPixelOffset = blockIdx.z; // pixel idx in filter
- const int blockPixelY = blockPixelOffset / filterSize,
- blockPixelX = blockPixelOffset % filterSize;
- const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread;
- const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors;
-
- images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX;
-
- hidActs += blockFilterIdx * numImages * numModules +
- loadY * numImages * numModules + loadX;
-
- targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors +
- (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters +
- blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x;
- // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return;
-
- const int mStartX =
- max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride));
- const int mStartY =
- max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride));
- const int mEndX =
- min(numModulesX,
- min(blockModuleStartX + sumWidth,
- DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride)));
- const int mEndY =
- min(numModulesY,
- min(blockModuleStartY + sumWidth,
- DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride)));
-
- // if (mStartY == mEndY || mStartX == mEndX) {
- // return;
- // }
-
- float* shHidActLoad = &shHidActs[loadY][loadX];
- float* shImgLoad = &shImages[loadY][loadX];
- float prod[colorsPerThread][filtersPerThread];
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[c][f] = 0;
- }
- }
-
- /*
- * Note; iterating this way is about 1% slower and uses a few more registers
- * than iterating over the modules linearly. But it's consistent with the
- * preload routines, so I'm using it.
- */
- for (int my = mStartY; my < mEndY; my++) {
- const int imgLoadModPosY = paddingStart + my * moduleStride;
- const int pxY = imgLoadModPosY + blockPixelY; // pixel x,y coords in image
- for (int mx = mStartX; mx < mEndX; mx++) {
- const int m = my * numModulesX + mx;
- const int imgLoadModPosX = paddingStart + mx * moduleStride;
- const int pxX = imgLoadModPosX + blockPixelX;
- const int pixIdx =
- (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image
- for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
- // Checking this condition actually makes things faster ... :/
- // So I've removed the !checkCaseBounds flag and just check it all the
- // time.
- if (caseIdx + loadX < numImages) {
- /*
- * As long as B_Y * B_X is divisible by preloadCases this will loop
- * the right number of times.
- *
- * This will load some images from filter pixels that don't exist
- * (it'll set those to 0), but the code does not produce any output
- * for those pixels (see last lines).
- */
- if (loadY < B_Y * colorsPerThread) {
-#pragma unroll
- for (int y = 0; y < B_Y * colorsPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_Y * colorsPerThread) {
- shImgLoad[(y)*preloadCases] =
- images[caseIdx + y * imgPixels * imgStride + pixIdx];
- }
- }
- }
-
- if (loadY < B_X * filtersPerThread) {
-#pragma unroll
- for (int y = 0; y < B_X * filtersPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_X * filtersPerThread) {
- shHidActLoad[y * (preloadCases + 1)] = hidActs
- [caseIdx + y * numImages * numModules + m * numImages];
- }
- }
- }
- } else {
-#pragma unroll
- for (int y = 0; y < B_Y * colorsPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_Y * colorsPerThread) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_Y * colorsPerThread) {
- shImgLoad[(y)*preloadCases] = 0;
- }
- }
-#pragma unroll
- for (int y = 0; y < B_X * filtersPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_X * filtersPerThread) {
- shHidActLoad[y * (preloadCases + 1)] = 0;
- }
- }
- }
-
- __syncthreads();
-#pragma unroll
- for (int i = 0; i < preloadCases; i++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- prod[c][f] += shImages[threadIdx.y + c * B_Y][i] *
- shHidActs[threadIdx.x + f * B_X][i];
- }
- }
- }
- __syncthreads();
- }
- }
- }
- if (scale) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets *
- targets[c * B_Y * filterPixels * numFilters + f * B_X] +
- scaleOutputs * prod[c][f];
- }
- }
- } else {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets[c * B_Y * filterPixels * numFilters + f * B_X] =
- scaleOutputs * prod[c][f];
- }
- }
- }
-}
-
-/*
- * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X
- * filters threadIdx.x determines filter threadIdx.y determines pixel in filter
- *
- * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of
- * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread
- *
- * Number of filters must be divisible by B_X * filtersPerThread
- * Number of images (cases) should be divisible by preloadCases if
- * checkCaseBounds is false.
- *
- * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- *
- * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels,
- * numFilters)
- *
- * B_Y * B_X should be divisible by preloadCases.
- * preloadCases one of 16, 32.
- * B_X one of 4, 8, 16, 32
- * B_Y arbitrary (satisfying divisibility constraints)
- * numModules must be divisible by partialSum
- * pixelsPerThread must be divisible by pixelCache
- *
- * After adding pixelsPerThread, register usage went from 20 to 23 (when
- * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's
- * unable to optimize that case away.
- */
-template <
- int B_Y,
- int B_X,
- int pixelCache,
- int pixelsPerThread,
- int filtersPerThread,
- int preloadCases,
- int numColors,
- bool scale,
- bool checkCaseBounds>
-__global__ void conv_weight_acts_c_kepler_sw(
- float* images,
- float* hidActs,
- float* targets,
- const int numImages,
- const int numFilters,
- const int numModulesY,
- const int numModulesX,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int imgStride,
- const int sumWidth,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shImages[pixelCache * B_Y * numColors]
- [preloadCases]; // preload preloadCases cases of B_Y
- // * pixelsPerThread pixels
- __shared__ float
- shHidActs[B_X * filtersPerThread]
- [preloadCases + 1]; // preload preloadCases cases of B_X hidActs
-
- const int tidx = B_X * threadIdx.y + threadIdx.x;
- const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
-
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
-
- const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
-
- const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks;
-
- const int numModuleChunksX = DIVUP(numModulesX, sumWidth);
- // const int numModuleChunksY = DIVUP(numModulesY, sumWidth);
-
- const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX;
- const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX;
-
- const int blockModuleStartX = blockModuleChunkX * sumWidth;
- const int blockModuleStartY = blockModuleChunkY * sumWidth;
-
- const int blockFilterIdx =
- B_X * filtersPerThread * (blockIdx.x % numFilterBlocks);
-
- // const int moduleStride = (imgSize - filterSize + 1) / numModulesX;
- const int numModules = numModulesY * numModulesX;
-
- const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread;
-
- images += loadX;
- hidActs += blockFilterIdx * numImages * numModules
- // + loadY * numImages * numModules
- + loadX;
-
- targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors +
- blockPixelOffset * numFilters + blockFilterIdx +
- threadIdx.y * numFilters + threadIdx.x;
-
- // float* shImgLoad = &shImages[loadY][loadX];
- // float* shHidActLoad = &shHidActs[loadY][loadX];
-
- float prod[numColors][pixelsPerThread][filtersPerThread];
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[c][p][f] = 0;
- }
- }
- }
- const int mStartX = blockModuleStartX;
- const int mStartY = blockModuleStartY;
- const int mEndX = min(numModulesX, blockModuleStartX + sumWidth);
- const int mEndY = min(numModulesY, blockModuleStartY + sumWidth);
-
- // if (mStartY == mEndY || mStartX == mEndX) {
- // return;
- // }
-
- const int fYOff = (blockPixelOffset + tidx) / filterSize;
- const int fXOff = (blockPixelOffset + tidx) % filterSize;
- __shared__ int pxIdxes[B_Y * pixelsPerThread];
- for (int my = mStartY; my < mEndY; my++) {
- const int imgLoadModPosY = paddingStart + my * moduleStride;
- for (int mx = mStartX; mx < mEndX; mx++) {
- const int m = my * numModulesX + mx;
-
- __syncthreads();
- const int imgLoadModPosX = paddingStart + mx * moduleStride;
- if (tidx < B_Y * pixelsPerThread) {
- // const int imgLoadModPosY = paddingStart + my *
- // moduleStride; const int imgLoadModPosX = paddingStart
- // + mx * moduleStride;
- int pxY = (imgLoadModPosY + fYOff);
- int pxX = (imgLoadModPosX + fXOff);
- int pixIdx = (pxY * imgSizeX + pxX) * imgStride;
- pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX
- ? pixIdx
- : -1;
- }
- __syncthreads();
- for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
- if (/*loadY < B_X*filtersPerThread &&*/ (
- !checkCaseBounds || caseIdx + loadX < numImages)) {
-#pragma unroll
- for (int y = 0; y < B_X * filtersPerThread;
- y += (B_X * B_Y) / preloadCases) {
- const int fIdx = ((loadY + y) % filtersPerThread) * B_X +
- (loadY + y) / filtersPerThread;
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 ||
- loadY + y < B_X * filtersPerThread) {
- shHidActs[loadY + y][loadX] = hidActs
- [caseIdx + fIdx * numImages * numModules + m * numImages];
- }
- }
- } else {
-#pragma unroll
- for (int y = 0; y < B_X * filtersPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // const int fIdx = ((loadY + y) %
- // filtersPerThread) * B_X + (loadY + y) /
- // filtersPerThread;
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 ||
- loadY + y < B_X * filtersPerThread) {
- shHidActs[loadY + y][loadX] = 0;
- }
- }
- }
-#pragma unroll
- for (int pp = 0; pp < pixelsPerThread; pp += pixelCache) {
-// if (loadY < B_Y * pixelCache) { // This condition is not necessary for
-// correctness, but it speeds things a bit
-/*
- * As long as B_Y * B_X is divisible by preloadCases this will loop the right
- * number of times.
- *
- * This will load some imgGrads from filter pixels that don't exit (it'll set
- * those to 0), but the code does not produce any output for those pixels (see
- * last lines).
- */
-#pragma unroll
- for (int y = 0; y < B_Y * pixelCache;
- y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_Y * pixelCache) {
- const int pxIdx = pp * B_Y + loadY + y; // pixel idx in filter
-
- if (pxIdx + blockPixelOffset < filterPixels &&
- (!checkCaseBounds || caseIdx + loadX < numImages)) {
- const int pixIdx =
- pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride;
-
- if (pixIdx >= 0) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] =
- images[caseIdx + c * imgPixels * imgStride + pixIdx];
- }
- } else {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0;
- }
- }
- } else {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0;
- }
- }
- }
- }
- //}
-
- __syncthreads();
-
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int i = 0; i < preloadCases; i++) {
-#pragma unroll
- for (int p = 0; p < pixelCache; p++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[c][pp + p][f] +=
- shImages[threadIdx.y + p * B_Y + c * pixelCache * B_Y]
- [i] *
- shHidActs[threadIdx.x * filtersPerThread + f][i];
- }
- }
- }
- }
-
- __syncthreads();
- }
- }
- }
- }
-
- if (scale) {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
- if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets
- [p * B_Y * numFilters + c * filterPixels * numFilters +
- f * B_X] = scaleTargets *
- targets[p * B_Y * numFilters +
- c * filterPixels * numFilters + f * B_X] +
- scaleOutputs * prod[c][p][f];
- }
- }
- }
- }
- } else {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
- if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets
- [p * B_Y * numFilters + c * filterPixels * numFilters +
- f * B_X] = scaleOutputs * prod[c][p][f];
- }
- }
- }
- }
- }
-}
-
-#define WA_C3_LOOP(pp, c) \
- _Pragma("unroll") for (int i = 0; i < preloadCases; i++) { \
- _Pragma("unroll") for (int p = 0; p < pixelCache; p++) { \
- _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \
- prod[c][(pp) + p][f] += \
- shImages[threadIdx.y + p * B_Y + (c)*pixelCache * B_Y][i] * \
- shHidActs[threadIdx.x * filtersPerThread + f][i]; \
- } \
- } \
- }
-
-#define WA_C3_LOOP2(pp) \
- _Pragma("unroll") for (int p = 0; p < pixelCache; p++) { \
- _Pragma("unroll") for (int i = 0; i < preloadCases; i++) { \
- _Pragma("unroll") for (int f = 0; f < filtersPerThread; f++) { \
- _Pragma("unroll") for (int c = 0; c < 3; ++c) { \
- prod[c][(pp) + p][f] += \
- shImages[threadIdx.y + p * B_Y + (c)*pixelCache * B_Y][i] * \
- shHidActs[threadIdx.x * filtersPerThread + f][i]; \
- } \
- } \
- } \
- }
-
-#define WA_3_FIDX(y) \
- (((loadY + (y)*B_X * B_Y / preloadCases) % filtersPerThread) * B_X + \
- (loadY + (y)*B_X * B_Y / preloadCases) / filtersPerThread)
-
-/*
- * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X
- * filters threadIdx.x determines filter threadIdx.y determines pixel in filter
- *
- * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of
- * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread
- *
- * Number of filters must be divisible by B_X * filtersPerThread
- * Number of images (cases) should be divisible by preloadCases if
- * checkCaseBounds is false.
- *
- * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- *
- * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels,
- * numFilters)
- *
- * B_Y * B_X should be divisible by preloadCases.
- * preloadCases one of 16, 32.
- * B_X one of 4, 8, 16, 32
- * B_Y arbitrary (satisfying divisibility constraints)
- * numModules must be divisible by partialSum
- * pixelsPerThread must be divisible by pixelCache
- *
- * After adding pixelsPerThread, register usage went from 20 to 23 (when
- * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's
- * unable to optimize that case away.
- */
-template <
- int B_Y,
- int B_X,
- int pixelCache,
- int pixelsPerThread,
- int filtersPerThread,
- int preloadCases,
- int numColors,
- bool scale,
- bool checkCaseBounds>
-//__launch_bounds__(256,2)
-__global__ void conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3(
- cudaTextureObject_t images,
- cudaTextureObject_t hidActs,
- float* targets,
- const int numImages,
- const int numFilters,
- const int numModulesY,
- const int numModulesX,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int imgStride,
- const int sumWidth,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shImages[pixelCache * B_Y * numColors]
- [preloadCases]; // preload preloadCases cases of B_Y
- // * pixelsPerThread pixels
- __shared__ float
- shHidActs[B_X * filtersPerThread]
- [preloadCases + 1]; // preload preloadCases cases of B_X hidActs
-
- const int tidx = B_X * threadIdx.y + threadIdx.x;
- const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
-
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
-
- const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
-
- const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks;
-
- const int numModuleChunksX = DIVUP(numModulesX, sumWidth);
- // const int numModuleChunksY = DIVUP(numModulesY, sumWidth);
-
- const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX;
- const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX;
-
- const int blockModuleStartX = blockModuleChunkX * sumWidth;
- const int blockModuleStartY = blockModuleChunkY * sumWidth;
-
- const int blockFilterIdx =
- B_X * filtersPerThread * (blockIdx.x % numFilterBlocks);
-
- // const int moduleStride = (imgSize - filterSize + 1) / numModulesX;
- const int numModules = numModulesY * numModulesX;
-
- const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread;
- const int imgOffset = loadX;
- const int hidActsOffset = blockFilterIdx * numImages * numModules + loadX;
- // images += loadX;
- // hidActs += blockFilterIdx * numImages * numModules
- // + loadX;
-
- targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors +
- blockPixelOffset * numFilters + blockFilterIdx +
- threadIdx.y * numFilters + threadIdx.x;
-
- // float* shImgLoad = &shImages[loadY][loadX];
- // float* shHidActLoad = &shHidActs[loadY][loadX];
-
- float prod[numColors][pixelsPerThread][filtersPerThread];
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[c][p][f] = 0;
- }
- }
- }
- const int mStartX = blockModuleStartX;
- const int mStartY = blockModuleStartY;
- const int mEndX = min(numModulesX, blockModuleStartX + sumWidth);
- const int mEndY = min(numModulesY, blockModuleStartY + sumWidth);
-
- const bool doWork = mStartY < mEndY && mStartX < mEndX;
- // if (!doWork) {
- // hidActs -=
- // }
- // if (mStartY == mEndY || mStartX == mEndX) {
- // return;
- // }
-
- // float imPreload[pixelCache * numColors * preloadCases / B_X]; // [12]
- float haPreload[filtersPerThread * preloadCases / B_Y]; // [8]
- // if (blockIdx.x != 0 || blockIdx.y !=0) {
- // return;
- // }
- // printf("mStartX: %d, mStartX: %d, mStartX: %d, mStartX: %d\n", mStartX,
- // mStartY, mEndX, mEndY);
- const int fYOff = (blockPixelOffset + tidx) / filterSize;
- const int fXOff = (blockPixelOffset + tidx) % filterSize;
- __shared__ int pxIdxes[B_Y * pixelsPerThread];
- // __shared__ int fidx[filtersPerThread * preloadCases / B_Y]; // [8]
-
- int m = mStartY * numModulesX + mStartX;
-
- int fidx[filtersPerThread * preloadCases / B_Y];
- if (doWork) {
-#pragma unroll
- for (int y = 0; y < filtersPerThread * preloadCases / B_Y; ++y) {
- const int fIdx = WA_3_FIDX(y);
- // if (doWork) {
- haPreload[y] = tex1Dfetch<float>(
- hidActs,
- hidActsOffset + fIdx * numImages * numModules + m * numImages);
- // }
- fidx[y] = fIdx * numImages * numModules;
- }
- }
-
- for (int my = mStartY; my < mEndY; my++) {
- const int imgLoadModPosY = paddingStart + my * moduleStride;
- for (int mx = mStartX; mx < mEndX; mx++) {
- m = my * numModulesX + mx;
-
- // __syncthreads();
- const int imgLoadModPosX = paddingStart + mx * moduleStride;
- if (tidx < B_Y * pixelsPerThread) {
- // const int imgLoadModPosY = paddingStart + my *
- // moduleStride; const int imgLoadModPosX = paddingStart
- // + mx * moduleStride;
- const int pxY = (imgLoadModPosY + fYOff);
- const int pxX = (imgLoadModPosX + fXOff);
- const int pixIdx = (pxY * imgSizeX + pxX) * imgStride;
- pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX
- ? pixIdx
- : -1;
- }
- __syncthreads();
-
- int myNext = my, mxNext = mx, mNext = m;
- const bool lastModule = my == mEndY - 1 && mx == mEndX - 1;
-
- if (!lastModule) {
- mxNext = mx + 1 == mEndX ? mStartX : mx + 1;
- myNext = my + (mx + 1 == mEndX);
- mNext = myNext * numModulesX + mxNext;
- }
-
- for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
- const bool lastBatch = caseIdx + preloadCases == numImages;
- // const float* im = &images[caseIdx + preloadCases +
- // pixIdx]; const float* ha = &hidActs[caseIdx +
- // preloadCases + m * numImages];
- int hidActsOffset2 =
- hidActsOffset + caseIdx + preloadCases + m * numImages;
-
- if (lastBatch) {
- // ha = &hidActs[mNext * numImages];
- hidActsOffset2 = hidActsOffset + mNext * numImages;
- }
-
-#pragma unroll
- for (int y = 0; y < B_X * filtersPerThread;
- y += (B_X * B_Y) / preloadCases) {
- shHidActs[loadY + y][loadX] =
- haPreload[y * preloadCases / (B_X * B_Y)];
- }
-
-/* ==================================================================================
- * Iteration 0
- * ==================================================================================
- */
-#pragma unroll
- for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0;
- }
- }
-#pragma unroll
- for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) {
- const int pxIdx = 0 * B_Y + loadY + y; // pixel idx in filter
- if (pxIdx + blockPixelOffset < filterPixels) {
- const int pixIdx =
- pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride;
- if (pixIdx >= 0) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] =
- tex1Dfetch<float>(
- images,
- imgOffset + caseIdx + c * imgPixels * imgStride +
- pixIdx);
- }
- }
- }
- }
-
- __syncthreads();
-
- haPreload[0] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[0]);
- haPreload[1] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[1]);
- WA_C3_LOOP(0, 0);
- haPreload[2] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[2]);
- haPreload[3] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[3]);
- WA_C3_LOOP(0, 1);
- haPreload[4] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[4]);
- haPreload[5] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[5]);
- WA_C3_LOOP(0, 2);
- haPreload[6] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[6]);
- haPreload[7] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[7]);
-
- __syncthreads();
- }
- }
- }
-
- if (scale) {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
- if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets
- [p * B_Y * numFilters + c * filterPixels * numFilters +
- f * B_X] = scaleTargets *
- targets[p * B_Y * numFilters +
- c * filterPixels * numFilters + f * B_X] +
- scaleOutputs * prod[c][p][f];
- }
- }
- }
- }
- } else {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
- if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- // if (threadIdx.x == 3)
- targets
- [p * B_Y * numFilters + c * filterPixels * numFilters +
- f * B_X] = scaleOutputs * prod[c][p][f];
- }
- }
- }
- }
- }
-}
-
-/*
- * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X
- * filters threadIdx.x determines filter threadIdx.y determines pixel in filter
- *
- * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of
- * partialSum blockIdx.y determines pixel batch of B_Y * pixelsPerThread
- *
- * Number of filters must be divisible by B_X * filtersPerThread
- * Number of images (cases) should be divisible by preloadCases if
- * checkCaseBounds is false.
- *
- * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- *
- * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels,
- * numFilters)
- *
- * B_Y * B_X should be divisible by preloadCases.
- * preloadCases one of 16, 32.
- * B_X one of 4, 8, 16, 32
- * B_Y arbitrary (satisfying divisibility constraints)
- * numModules must be divisible by partialSum
- * pixelsPerThread must be divisible by pixelCache
- *
- * After adding pixelsPerThread, register usage went from 20 to 23 (when
- * pixelsPerThread = 1)... so the compiler is messing up here somehow. It's
- * unable to optimize that case away.
- */
-template <
- int B_Y,
- int B_X,
- int pixelCache,
- int pixelsPerThread,
- int filtersPerThread,
- int preloadCases,
- int numColors,
- bool scale,
- bool checkCaseBounds>
-__launch_bounds__(256, 2) __global__
- void conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3(
- cudaTextureObject_t images,
- cudaTextureObject_t hidActs,
- float* targets,
- const int numImages,
- const int numFilters,
- const int numModulesY,
- const int numModulesX,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int imgStride,
- const int sumWidth,
- const float scaleTargets,
- const float scaleOutputs) {
- __shared__ float shImages[pixelCache * B_Y * numColors]
- [preloadCases]; // preload preloadCases cases of B_Y
- // * pixelsPerThread pixels
- __shared__ float
- shHidActs[B_X * filtersPerThread]
- [preloadCases + 1]; // preload preloadCases cases of B_X hidActs
-
- const int tidx = B_X * threadIdx.y + threadIdx.x;
- const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
-
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
-
- const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
-
- const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks;
-
- const int numModuleChunksX = DIVUP(numModulesX, sumWidth);
- // const int numModuleChunksY = DIVUP(numModulesY, sumWidth);
-
- const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX;
- const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX;
-
- const int blockModuleStartX = blockModuleChunkX * sumWidth;
- const int blockModuleStartY = blockModuleChunkY * sumWidth;
-
- const int blockFilterIdx =
- B_X * filtersPerThread * (blockIdx.x % numFilterBlocks);
-
- // const int moduleStride = (imgSize - filterSize + 1) / numModulesX;
- const int numModules = numModulesY * numModulesX;
-
- const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread;
- const int imgOffset = loadX;
- const int hidActsOffset = blockFilterIdx * numImages * numModules + loadX;
- // images += loadX;
- // hidActs += blockFilterIdx * numImages * numModules
- // + loadX;
-
- targets += (blockModuleChunkIdx * numFilters) * filterPixels * numColors +
- blockPixelOffset * numFilters + blockFilterIdx +
- threadIdx.y * numFilters + threadIdx.x;
-
- // float* shImgLoad = &shImages[loadY][loadX];
- // float* shHidActLoad = &shHidActs[loadY][loadX];
-
- float prod[numColors][pixelsPerThread][filtersPerThread];
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- prod[c][p][f] = 0;
- }
- }
- }
- const int mStartX = blockModuleStartX;
- const int mStartY = blockModuleStartY;
- const int mEndX = min(numModulesX, blockModuleStartX + sumWidth);
- const int mEndY = min(numModulesY, blockModuleStartY + sumWidth);
-
- const bool doWork = mStartY < mEndY && mStartX < mEndX;
- // if (mStartY == mEndY || mStartX == mEndX) {
- // return;
- // }
-
- // float imPreload[pixelCache * numColors * preloadCases / B_X]; // [12]
- float haPreload[filtersPerThread * preloadCases / B_Y]; // [6]
- // if (blockIdx.x != 0 || blockIdx.y !=0) {
- // return;
- // }
- // printf("mStartX: %d, mStartX: %d, mStartX: %d, mStartX: %d\n", mStartX,
- // mStartY, mEndX, mEndY);
- const int fYOff = (blockPixelOffset + tidx) / filterSize;
- const int fXOff = (blockPixelOffset + tidx) % filterSize;
- __shared__ int pxIdxes[B_Y * pixelsPerThread];
- // __shared__ int fidx[filtersPerThread * preloadCases / B_Y]; // [6]
-
- int m = mStartY * numModulesX + mStartX;
- int fidx[filtersPerThread * preloadCases / B_Y];
- // if (doWork) {
-#pragma unroll
- for (int y = 0; y < filtersPerThread * preloadCases / B_Y; ++y) {
- fidx[y] = WA_3_FIDX(y) * numImages * numModules;
- if (doWork) { // Not actually necessary, I think
- haPreload[y] =
- tex1Dfetch<float>(hidActs, hidActsOffset + fidx[y] + m * numImages);
- }
- }
- // }
- int mNext = mStartY * numModulesX + mStartX;
- for (int my = mStartY; my < mEndY; my++) {
- // const int imgLoadModPosY = paddingStart + my * moduleStride;
- for (int mx = mStartX; mx < mEndX; mx++) {
- m = mNext; // my * numModulesX + mx;
-
- // __syncthreads();
- // const int imgLoadModPosX = paddingStart + mx * moduleStride;
- if (tidx < B_Y * pixelsPerThread) {
- const int imgLoadModPosY = paddingStart + my * moduleStride;
- const int imgLoadModPosX = paddingStart + mx * moduleStride;
- const int pxY = (imgLoadModPosY + fYOff);
- const int pxX = (imgLoadModPosX + fXOff);
- const int pixIdx = (pxY * imgSizeX + pxX) * imgStride;
- pxIdxes[tidx] = pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX
- ? pixIdx
- : -1;
- }
- __syncthreads();
-
- const bool lastModule = my == mEndY - 1 && mx == mEndX - 1;
- mNext = lastModule * m +
- !lastModule *
- ((my + (mx + 1 == mEndX)) * numModulesX +
- (mx + 1 == mEndX ? mStartX : mx + 1));
- // if (!lastModule) {
- // const int mxNext = mx + 1 == mEndX ? mStartX : mx + 1;
- // const int myNext = my + (mx + 1 == mEndX);
- // mNext = myNext * numModulesX + mxNext;
- // }
-
- for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
- const bool lastBatch = caseIdx + preloadCases == numImages;
- // const float* im = &images[caseIdx + preloadCases +
- // pixIdx]; const float* ha = hidActs + !lastBatch *
- // (caseIdx + preloadCases + m * numImages) + lastBatch *
- // mNext * numImages;
- const int hidActsOffset2 = hidActsOffset +
- !lastBatch * (caseIdx + preloadCases + m * numImages) +
- lastBatch * mNext * numImages;
- // if (lastBatch) {
- // ha = &hidActs[mNext * numImages];
- // }
-
-#pragma unroll
- for (int y = 0; y < B_X * filtersPerThread;
- y += (B_X * B_Y) / preloadCases) {
- shHidActs[loadY + y][loadX] =
- haPreload[y * preloadCases / (B_X * B_Y)];
- }
-
-/* ==================================================================================
- * Iteration 0
- * ==================================================================================
- */
-#pragma unroll
- for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_Y * pixelCache) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0;
- }
- }
- }
-#pragma unroll
- for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_Y * pixelCache) {
- const int pxIdx = 0 * B_Y + loadY + y; // pixel idx in filter
- const int pixIdx =
- pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride;
- if (pixIdx >= 0 && pxIdx + blockPixelOffset < filterPixels &&
- (!checkCaseBounds || caseIdx + loadX < numImages)) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] =
- tex1Dfetch<float>(
- images,
- imgOffset + caseIdx + c * imgPixels * imgStride +
- pixIdx);
- }
- }
- }
- }
-
- __syncthreads();
-
- haPreload[0] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[0]);
- haPreload[1] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[1]);
- haPreload[2] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[2]);
- haPreload[3] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[3]);
- haPreload[4] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[4]);
- haPreload[5] = tex1Dfetch<float>(hidActs, hidActsOffset2 + fidx[5]);
-
- WA_C3_LOOP2(0);
-
- __syncthreads();
-
-/* ==================================================================================
- * Iteration 1
- * ==================================================================================
- */
-#pragma unroll
- for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_Y * pixelCache) {
- // const int pxIdx = 2 * B_Y + loadY + y; //
- // pixel idx in filter
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] = 0;
- }
- }
- }
-
-#pragma unroll
- for (int y = 0; y < B_Y * pixelCache; y += (B_X * B_Y) / preloadCases) {
- // Make sure number of rows in the array is divisible by number of
- // rows filled per iteration
- if ((B_Y * pixelCache) % (B_X * B_Y / preloadCases) == 0 ||
- y + loadY < B_Y * pixelCache) {
- const int pxIdx = 2 * B_Y + loadY + y; // pixel idx in filter
- const int pixIdx =
- pxIdxes[pxIdx]; //(pxY * imgSizeX + pxX) * imgStride;
- if (pixIdx >= 0 && pxIdx + blockPixelOffset < filterPixels &&
- (!checkCaseBounds || caseIdx + loadX < numImages)) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
- shImages[loadY + y + c * pixelCache * B_Y][loadX] =
- tex1Dfetch<float>(
- images,
- imgOffset + caseIdx + c * imgPixels * imgStride +
- pixIdx);
- }
- }
- }
- }
-
- __syncthreads();
-
- WA_C3_LOOP2(2);
-
- __syncthreads();
- }
- }
- }
-
- if (scale) {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
- if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets
- [p * B_Y * numFilters + c * filterPixels * numFilters +
- f * B_X] = scaleTargets *
- targets[p * B_Y * numFilters +
- c * filterPixels * numFilters + f * B_X] +
- scaleOutputs * prod[c][p][f];
- }
- }
- }
- }
- } else {
-#pragma unroll
- for (int p = 0; p < pixelsPerThread; p++) {
- if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
-#pragma unroll
- for (int c = 0; c < numColors; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets
- [p * B_Y * numFilters + c * filterPixels * numFilters +
- f * B_X] = scaleOutputs * prod[c][p][f];
- }
- }
- }
- }
- }
-}
-
-/*****************************Function Revision
- *Record***************************** Author: Tencent BestImage
- *Team(ankerguo@tencent.com) * Date: 2015-05-18 *
- * Reason: Optimizing kernel to get faster speed according to GPU features *
- * Method: *
- * 1. reorganizing data structure to avoid bank conflict; *
- * 2. using vectorized data type; *
- * 3. improving instruction-level parallelism; *
- * 4. removing redundant 'if' branches; *
- * 5. removing local variables to save registers. *
- *********************************************************************************/
-
-/*
- * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- *
- * targets: (numModulesY*numModulesX/partialSum, numFilterColors,
- * filterPixels, numFilters)
- */
-template <
- int B_Y,
- int B_X,
- int filtersPerThread,
- int colorsPerThread,
- int preloadCases,
- bool scale>
-__launch_bounds__(128, 4) __global__
- void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(
- cudaTextureObject_t images,
- cudaTextureObject_t hidActs,
- float* targets,
- const int numImages,
- const int numFilters,
- const int numModulesY,
- const int numModulesX,
- const int imgSizeY,
- const int imgSizeX,
- const int filterSize,
- const int paddingStart,
- const int moduleStride,
- const int imgStride,
- const int numImgColors,
- const int numGroups,
- const int sumWidth,
- const float scaleTargets,
- const float scaleOutputs) {
- // avoid bank conflict by reorganizing the data structure, and improve the
- // band width by using 'float2' instead of 'float'
- __shared__ float2
- shImages[preloadCases]
- [colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases
- __shared__ float2 shHidActs[preloadCases]
- [filtersPerThread * B_X / 2 +
- 2]; // preload preloadCases cases of B_X hidacts
-
- const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y;
- const int tidx = B_X * ty + tx;
- const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
-
- const int filterPixels = filterSize * filterSize;
- const int imgPixels = imgSizeY * imgSizeX;
-
- const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
- const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks;
-
- const int numModuleChunksX = DIVUP(numModulesX, sumWidth);
- // const int numModuleChunksY = DIVUP(numModulesY, sumWidth);
-
- const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX;
- const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX;
-
- const int blockModuleStartX = blockModuleChunkX * sumWidth;
- const int blockModuleStartY = blockModuleChunkY * sumWidth;
-
- // const int moduleIdx = partialSum * outputModuleIdx;
- const int blockFilterIdx =
- filtersPerThread * B_X * (blockIdx.x % numFilterBlocks);
- const int numModules = numModulesY * numModulesX;
-
- const int numFiltersPerGroup = numFilters / numGroups;
- const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
- const int numFilterColors = numImgColors / numGroups;
-
- const int blockPixelOffset = blockIdx.z; // pixel idx in filter
- const int blockPixelY = blockPixelOffset / filterSize,
- blockPixelX = blockPixelOffset % filterSize;
- const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread;
- const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors;
- const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX;
- // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX;
- const int hidActsOffset = blockFilterIdx * numImages * numModules +
- loadY * numImages * numModules + loadX;
- //
- // hidActs +=
- // blockFilterIdx * numImages * numModules
- // + loadY * numImages * numModules
- // + loadX;
-
- targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors +
- (blockFilterColorIdx + ty) * filterPixels * numFilters +
- blockPixelOffset * numFilters + blockFilterIdx + tx;
- // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return;
-
- const int mStartX =
- max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride));
- const int mStartY =
- max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride));
- const int mEndX =
- min(numModulesX,
- min(blockModuleStartX + sumWidth,
- DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride)));
- const int mEndY =
- min(numModulesY,
- min(blockModuleStartY + sumWidth,
- DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride)));
-
- // if (mStartY == mEndY || mStartX == mEndX) {
- // return;
- // }
- const bool doWork = mStartY < mEndY && mStartX < mEndX;
-
- // reduce 2 registers
- // float* shHidActLoad = &shHidActs[loadY][loadX];
- // float* shImgLoad = &shImages[loadY][loadX];
-
- float imPreload[preloadCases * colorsPerThread / B_X]; // [8]
- float haPreload[preloadCases * filtersPerThread / B_Y]; // [8]
-
- float prod[filtersPerThread][colorsPerThread];
-
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
- prod[f][c] = 0;
- }
- }
- int pixIdx, pixIdxNext, m, mNext;
-
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords(
- mStartY,
- mStartX,
- paddingStart,
- numModulesX,
- moduleStride,
- blockPixelY,
- blockPixelX,
- imgSizeX,
- imgStride,
- pixIdx,
- m);
-
- if (doWork) {
-#pragma unroll
- for (int y = 0; y < B_Y * colorsPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // It's bizarre, but this is the fastest way I've found to get it not to
- // load nonexistent pixels. All other ways cause crazy excessive register
- // usage.
- const int idx = (mStartY < mEndY && mStartX < mEndX) *
- (0 + y * imgPixels * imgStride + pixIdx);
- imPreload[y * preloadCases / (B_X * B_Y)] =
- tex1Dfetch<float>(images, imgOffset + idx);
- }
- }
-
- if (doWork) {
-#pragma unroll
- for (int y = 0; y < B_X * filtersPerThread;
- y += (B_X * B_Y) / preloadCases) {
- // Almost certainly not necessary here.
- const int idx = (mStartY < mEndY && mStartX < mEndX) *
- (0 + y * numImages * numModules + m * numImages);
- haPreload[y * preloadCases / (B_X * B_Y)] =
- tex1Dfetch<float>(hidActs, hidActsOffset + idx);
- }
- }
-
- for (int my = mStartY; my < mEndY; my++) {
- for (int mx = mStartX; mx < mEndX; mx++) {
- int myNext = my, mxNext = mx;
- const bool lastModule = my == mEndY - 1 && mx == mEndX - 1;
-
- if (!lastModule) {
- mxNext = mx + 1 == mEndX ? mStartX : mx + 1;
- myNext = my + (mx + 1 == mEndX);
- }
-
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords(
- myNext,
- mxNext,
- paddingStart,
- numModulesX,
- moduleStride,
- blockPixelY,
- blockPixelX,
- imgSizeX,
- imgStride,
- pixIdxNext,
- mNext);
-
- for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
-// store the preloaded image's pixel into shared memory
-#pragma unroll
- for (int y = 0; y < 4; y++) {
- shImages[loadX][loadY + y * 8].x = imPreload[y];
- shImages[loadX][loadY + y * 8].y = imPreload[y + 4];
- }
- // const float* im = &images[caseIdx + preloadCases + pixIdx];
- // const float* ha = &hidActs[caseIdx + preloadCases + m * numImages];
- int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx;
- int hidActsOffset2 =
- hidActsOffset + caseIdx + preloadCases + m * numImages;
- if (caseIdx + preloadCases == numImages) {
- pixIdx = pixIdxNext;
- m = mNext;
- imgOffset2 = imgOffset + pixIdxNext;
- hidActsOffset2 = hidActsOffset + mNext * numImages;
- }
-
- // store the images and hidActs
- shHidActs[loadX][loadY].x = haPreload[0];
- shHidActs[loadX][loadY].y = haPreload[2];
- shHidActs[loadX][loadY + 16].x = haPreload[4];
- shHidActs[loadX][loadY + 16].y = haPreload[6];
- shHidActs[loadX][loadY + 8].x = haPreload[1];
- shHidActs[loadX][loadY + 8].y = haPreload[3];
- shHidActs[loadX][loadY + 24].x = haPreload[5];
- shHidActs[loadX][loadY + 24].y = haPreload[7];
-
-// preloade the image's and hidAct's pixel
-#pragma unroll
- for (int r = 0; r < 8; r++) {
- imPreload[r] = tex1Dfetch<float>(
- images, imgOffset2 + (r)*8 * imgPixels * imgStride);
- haPreload[r] = tex1Dfetch<float>(
- hidActs, hidActsOffset2 + (r)*8 * numImages * numModules);
- }
-
- __syncthreads();
-// put together the instructions of same type to improve instruction-level
-// parallelism
-#pragma unroll
- for (int r = 0; r < 16; r++) {
- for (int c = 0; c < 4; c++) {
- prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].x;
- prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].y;
- prod[2][c] +=
- shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].x;
- prod[3][c] +=
- shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].y;
- prod[0][c + 4] +=
- shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].x;
- prod[1][c + 4] +=
- shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].y;
- prod[2][c + 4] +=
- shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].x;
- prod[3][c + 4] +=
- shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].y;
- }
- }
-
- __syncthreads();
- }
- }
- }
-
- if (scale) {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets *
- targets[c * B_Y * filterPixels * numFilters + f * B_X] +
- scaleOutputs * prod[f][c];
- }
- }
- } else {
-#pragma unroll
- for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
- for (int f = 0; f < filtersPerThread; f++) {
- targets[c * B_Y * filterPixels * numFilters + f * B_X] =
- scaleOutputs * prod[f][c];
- }
- }
- }
-}
-
-/*
- * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- *
- * targets: (numModulesY*numModulesX/partialSum, numFilterColors,
- * filterPixels, numFilters)
- */
-/*
- * Weight-gradient (dW) kernel specialized for blockDim = (B_X=32, B_Y=8),
- * filtersPerThread=4, colorsPerThread=6, preloadCases=32 (see the [6]/[16]
- * preload-array size comments below, which match that configuration).
- * The inner loop is hand-scheduled with the WA_LOOP / WA_IMLOAD_TX /
- * WA_HALOAD_TX macros defined earlier in this file (not visible here) --
- * presumably one FMA step / one image preload / one hidAct preload each,
- * interleaved to overlap texture latency with math; confirm against the
- * macro definitions before editing the schedule.
- */
-template <
-    int B_Y,
-    int B_X,
-    int filtersPerThread,
-    int colorsPerThread,
-    int preloadCases,
-    bool scale>
-__launch_bounds__(256, 2) __global__
-    void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32(
-        cudaTextureObject_t images,
-        cudaTextureObject_t hidActs,
-        float* targets,
-        const int numImages,
-        const int numFilters,
-        const int numModulesY,
-        const int numModulesX,
-        const int imgSizeY,
-        const int imgSizeX,
-        const int filterSize,
-        const int paddingStart,
-        const int moduleStride,
-        const int imgStride,
-        const int numImgColors,
-        const int numGroups,
-        const int sumWidth,
-        const float scaleTargets,
-        const float scaleOutputs) {
-  __shared__ float shImages[colorsPerThread * B_Y]
-                           [preloadCases]; // preload preloadCases cases
-  __shared__ float
-      shHidActs[filtersPerThread * B_X]
-               [preloadCases + 1]; // preload preloadCases cases of B_X hidacts
-
-  // Cooperative-load coordinates: each group of preloadCases threads loads
-  // one case column (loadX) for one color/filter row (loadY).
-  const int tidx = B_X * threadIdx.y + threadIdx.x;
-  const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
-
-  const int filterPixels = filterSize * filterSize;
-  const int imgPixels = imgSizeY * imgSizeX;
-
-  const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
-  const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks;
-
-  const int numModuleChunksX = DIVUP(numModulesX, sumWidth);
-  // const int numModuleChunksY = DIVUP(numModulesY, sumWidth);
-
-  // 2-D coordinates of the sumWidth x sumWidth module chunk this block sums.
-  const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX;
-  const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX;
-
-  const int blockModuleStartX = blockModuleChunkX * sumWidth;
-  const int blockModuleStartY = blockModuleChunkY * sumWidth;
-
-  // const int moduleIdx = partialSum * outputModuleIdx;
-  const int blockFilterIdx =
-      filtersPerThread * B_X * (blockIdx.x % numFilterBlocks);
-  const int numModules = numModulesY * numModulesX;
-
-  const int numFiltersPerGroup = numFilters / numGroups;
-  const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
-  const int numFilterColors = numImgColors / numGroups;
-
-  const int blockPixelOffset = blockIdx.z; // pixel idx in filter
-  const int blockPixelY = blockPixelOffset / filterSize,
-            blockPixelX = blockPixelOffset % filterSize;
-  const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread;
-  const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors;
-
-  // Base texel offsets; texture fetches below add per-iteration displacements.
-  const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX;
-  const int hidActsOffset = blockFilterIdx * numImages * numModules +
-      loadY * numImages * numModules + loadX;
-  // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX;
-  //
-  // hidActs +=
-  //     blockFilterIdx * numImages * numModules
-  //     + loadY * numImages * numModules
-  //     + loadX;
-
-  targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors +
-      (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters +
-      blockPixelOffset * numFilters + blockFilterIdx + threadIdx.x;
-  // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return;
-
-  // Module range: intersect this block's chunk with the modules whose
-  // receptive field actually contains this filter pixel (padding clamp).
-  const int mStartX =
-      max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride));
-  const int mStartY =
-      max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride));
-  const int mEndX =
-      min(numModulesX,
-          min(blockModuleStartX + sumWidth,
-              DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride)));
-  const int mEndY =
-      min(numModulesY,
-          min(blockModuleStartY + sumWidth,
-              DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride)));
-
-  // if (mStartY == mEndY || mStartX == mEndX) {
-  //     return;
-  // }
-  // No early return: even idle blocks fall through to write (scaled) zeros.
-  const bool doWork = mStartY < mEndY && mStartX < mEndX;
-
-  float* shHidActLoad = &shHidActs[loadY][loadX];
-  float* shImgLoad = &shImages[loadY][loadX];
-
-  // Double-buffer registers holding the next tile while the current one is
-  // consumed from shared memory.
-  float imPreload[preloadCases * colorsPerThread / B_X]; // [6]
-  float haPreload[preloadCases * filtersPerThread / B_Y]; // [16]
-
-  // Per-thread partial sums of the weight gradient.
-  float prod[filtersPerThread][colorsPerThread];
-
-#pragma unroll
-  for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
-    for (int c = 0; c < colorsPerThread; c++) {
-      prod[f][c] = 0;
-    }
-  }
-  int pixIdx, pixIdxNext, m, mNext;
-
-  // Helper defined earlier in this file (not visible here); it appears to
-  // compute the linear image-pixel offset (pixIdx) and module index (m).
-  conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords(
-      mStartY,
-      mStartX,
-      paddingStart,
-      numModulesX,
-      moduleStride,
-      blockPixelY,
-      blockPixelX,
-      imgSizeX,
-      imgStride,
-      pixIdx,
-      m);
-
-  // Prime the double buffers for the first module / first case chunk.
-  if (doWork) {
-#pragma unroll
-    for (int y = 0; y < B_Y * colorsPerThread;
-         y += (B_X * B_Y) / preloadCases) {
-      imPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch<float>(
-          images, imgOffset + y * imgPixels * imgStride + pixIdx);
-    }
-
-#pragma unroll
-    for (int y = 0; y < B_X * filtersPerThread;
-         y += (B_X * B_Y) / preloadCases) {
-      haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch<float>(
-          hidActs, hidActsOffset + y * numImages * numModules + m * numImages);
-    }
-  }
-  // if (mStartY > mEndY || mStartX > mEndX) {
-  //     printf("crzy!!\n");
-  // }
-
-  // Main loop: over modules in the chunk, then over images in chunks of
-  // preloadCases, software-pipelining texture loads against the FMA steps.
-  for (int my = mStartY; my < mEndY; my++) {
-    for (int mx = mStartX; mx < mEndX; mx++) {
-      int myNext = my, mxNext = mx;
-      const bool lastModule = my == mEndY - 1 && mx == mEndX - 1;
-
-      if (!lastModule) {
-        mxNext = mx + 1 == mEndX ? mStartX : mx + 1;
-        myNext = my + (mx + 1 == mEndX);
-      }
-
-      conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords(
-          myNext,
-          mxNext,
-          paddingStart,
-          numModulesX,
-          moduleStride,
-          blockPixelY,
-          blockPixelX,
-          imgSizeX,
-          imgStride,
-          pixIdxNext,
-          mNext);
-
-      for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
-        // Flush the previously preloaded tiles into shared memory.
-#pragma unroll
-        for (int y = 0; y < B_Y * colorsPerThread;
-             y += (B_X * B_Y) / preloadCases) {
-          shImgLoad[(y)*preloadCases] =
-              imPreload[y * preloadCases / (B_X * B_Y)];
-        }
-
-#pragma unroll
-        for (int y = 0; y < B_X * filtersPerThread;
-             y += (B_X * B_Y) / preloadCases) {
-          shHidActLoad[y * (preloadCases + 1)] =
-              haPreload[y * preloadCases / (B_X * B_Y)];
-        }
-
-        __syncthreads();
-
-        // const float* im = &images[caseIdx + preloadCases +
-        // pixIdx]; const float* ha = &hidActs[caseIdx +
-        // preloadCases + m * numImages];
-        int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx;
-        int hidActsOffset2 =
-            hidActsOffset + caseIdx + preloadCases + m * numImages;
-        // Last case chunk of this module: preload from the next module
-        // instead of reading past numImages.
-        if (caseIdx + preloadCases == numImages) {
-          pixIdx = pixIdxNext;
-          m = mNext;
-          imgOffset2 = imgOffset + pixIdxNext;
-          hidActsOffset2 = hidActsOffset + mNext * numImages;
-        }
-
-        // Hand-interleaved schedule: 32 accumulation steps (WA_LOOP) with
-        // 6 image preloads and 16 hidAct preloads threaded between them.
-        WA_LOOP(0);
-        WA_LOOP(1);
-        WA_LOOP(2);
-        WA_LOOP(3);
-        WA_LOOP(4);
-
-        WA_LOOP(5);
-        WA_IMLOAD_TX(0);
-        WA_LOOP(6);
-        WA_IMLOAD_TX(1);
-        WA_LOOP(7);
-        WA_IMLOAD_TX(2);
-        WA_LOOP(8);
-        WA_IMLOAD_TX(3);
-        WA_LOOP(9);
-        WA_IMLOAD_TX(4);
-        WA_LOOP(10);
-        WA_IMLOAD_TX(5);
-
-        WA_LOOP(11);
-        WA_HALOAD_TX(0);
-        WA_LOOP(12);
-        WA_HALOAD_TX(1);
-        WA_LOOP(13);
-        WA_HALOAD_TX(2);
-        WA_LOOP(14);
-        WA_HALOAD_TX(3);
-        WA_LOOP(15);
-        WA_HALOAD_TX(4);
-        WA_LOOP(16);
-        WA_HALOAD_TX(5);
-        WA_LOOP(17);
-        WA_HALOAD_TX(6);
-        WA_LOOP(18);
-        WA_HALOAD_TX(7);
-        WA_LOOP(19);
-        WA_HALOAD_TX(8);
-        WA_LOOP(20);
-        WA_HALOAD_TX(9);
-        WA_LOOP(21);
-        WA_HALOAD_TX(10);
-        WA_LOOP(22);
-        WA_HALOAD_TX(11);
-        WA_LOOP(23);
-        WA_HALOAD_TX(12);
-        WA_LOOP(24);
-        WA_HALOAD_TX(13);
-        WA_LOOP(25);
-        WA_HALOAD_TX(14);
-        WA_LOOP(26);
-        WA_HALOAD_TX(15);
-
-        WA_LOOP(27);
-        WA_LOOP(28);
-        WA_LOOP(29);
-        WA_LOOP(30);
-        WA_LOOP(31);
-
-        __syncthreads();
-      }
-    }
-  }
-
-  // Write back the accumulated gradient tile, optionally blending with the
-  // existing target values ('scale' is a compile-time branch).
-  if (scale) {
-#pragma unroll
-    for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
-      for (int f = 0; f < filtersPerThread; f++) {
-        targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets *
-                targets[c * B_Y * filterPixels * numFilters + f * B_X] +
-            scaleOutputs * prod[f][c];
-      }
-    }
-  } else {
-#pragma unroll
-    for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
-      for (int f = 0; f < filtersPerThread; f++) {
-        targets[c * B_Y * filterPixels * numFilters + f * B_X] =
-            scaleOutputs * prod[f][c];
-      }
-    }
-  }
-}
-
-/***************************** Function Revision Record *************************
- * Author: Tencent BestImage Team (ankerguo@tencent.com)                        *
- * Date:   2015-05-18                                                           *
- * Reason: Optimizing kernel to get faster speed according to GPU features      *
- * Method:                                                                      *
- *   1. reorganizing data structure to avoid bank conflict;                     *
- *   2. using vectorized data type;                                             *
- *   3. improving instruction-level parallelism;                                *
- *   4. removing redundant 'if' branches;                                       *
- *   5. removing local variables to save registers.                             *
- ********************************************************************************/
-
-/*
- * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
- * hidActs: (numFilters, numModulesY, numModulesX, numImages)
- *
- * targets: (numModulesY*numModulesX/partialSum, numFilterColors,
- * filterPixels, numFilters)
- */
-/*
- * Weight-gradient (dW) kernel specialized for blockDim = (B_X=32, B_Y=8),
- * filtersPerThread=4, colorsPerThread=8, preloadCases=16 (instantiated as
- * <8, 32, 4, 8, 16, scale> by the dispatch code; the hard-coded 4/8/16
- * bounds below assume exactly that configuration). This variant spends
- * extra ALU work recomputing (ty * B_X + tx) / preloadCases etc. inline to
- * keep register usage down -- see the "reduce ... registers" notes.
- */
-template <
-    int B_Y,
-    int B_X,
-    int filtersPerThread,
-    int colorsPerThread,
-    int preloadCases,
-    bool scale>
-__launch_bounds__(256, 2) __global__
-    void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(
-        cudaTextureObject_t images,
-        cudaTextureObject_t hidActs,
-        float* targets,
-        const int numImages,
-        const int numFilters,
-        const int numModulesY,
-        const int numModulesX,
-        const int imgSizeY,
-        const int imgSizeX,
-        const int filterSize,
-        const int paddingStart,
-        const int moduleStride,
-        const int imgStride,
-        const int numImgColors,
-        const int numGroups,
-        const int sumWidth,
-        const float scaleTargets,
-        const float scaleOutputs) {
-  // avoid bank conflict by re-organizing the data structure, and improve
-  // bandwidth by using 'float2' instead of 'float'
-  __shared__ float2
-      shImages[preloadCases]
-              [colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases
-  __shared__ float2 shHidActs[preloadCases]
-                             [filtersPerThread * B_X / 2 +
-                              2]; // preload preloadCases cases of B_X hidacts
-  const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y;
-  // const int tidx = B_X * threadIdx.y + threadIdx.x;
-  // reduce two registers
-  // const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
-
-  // const int filterPixels = filterSize * filterSize;
-  // reduce one register
-  const int filterPixelsAll = numFilters * filterSize * filterSize;
-  const int imgPixels = imgSizeY * imgSizeX;
-
-  const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
-  const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks;
-
-  const int numModuleChunksX = DIVUP(numModulesX, sumWidth);
-  // const int numModuleChunksY = DIVUP(numModulesY, sumWidth);
-
-  // 2-D coordinates of the sumWidth x sumWidth module chunk this block sums.
-  const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX;
-  const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX;
-
-  const int blockModuleStartX = blockModuleChunkX * sumWidth;
-  const int blockModuleStartY = blockModuleChunkY * sumWidth;
-
-  // const int moduleIdx = partialSum * outputModuleIdx;
-  const int blockFilterIdx =
-      filtersPerThread * B_X * (blockIdx.x % numFilterBlocks);
-  const int numModules = numModulesY * numModulesX;
-
-  const int numFiltersPerGroup = numFilters / numGroups;
-  const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
-  const int numFilterColors = numImgColors / numGroups;
-
-  const int blockPixelOffset = blockIdx.z; // pixel idx in filter
-  const int blockPixelY = blockPixelOffset / filterSize,
-            blockPixelX = blockPixelOffset % filterSize;
-  const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread;
-  const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors;
-  // (ty * B_X + tx) / preloadCases and ... % preloadCases play the role of
-  // loadY / loadX from the other variants, recomputed inline.
-  const int imgOffset =
-      (imgColorIdx + (ty * B_X + tx) / preloadCases) * imgPixels * imgStride +
-      (ty * B_X + tx) % preloadCases;
-  // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX;
-  const int hidActsOffset = blockFilterIdx * numImages * numModules +
-      ((ty * B_X + tx) / preloadCases) * numImages * numModules +
-      ((ty * B_X + tx) % preloadCases);
-  //
-  // hidActs +=
-  //     blockFilterIdx * numImages * numModules
-  //     + loadY * numImages * numModules
-  //     + loadX;
-
-  // use one temporary register instead of multiple registers
-  const int pIdxBase = imgStride *
-      ((paddingStart + blockPixelY) * imgSizeX + paddingStart + blockPixelX);
-
-  targets += blockModuleChunkIdx * numFilters * filterSize * filterSize *
-          numFilterColors +
-      (blockFilterColorIdx + ty) * filterSize * filterSize * numFilters +
-      blockPixelOffset * numFilters + blockFilterIdx + tx;
-  // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return;
-
-  // Module range: intersect this block's chunk with the modules whose
-  // receptive field actually contains this filter pixel (padding clamp).
-  const int mStartX =
-      max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride));
-  const int mStartY =
-      max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride));
-  const int mEndX =
-      min(numModulesX,
-          min(blockModuleStartX + sumWidth,
-              DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride)));
-  const int mEndY =
-      min(numModulesY,
-          min(blockModuleStartY + sumWidth,
-              DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride)));
-
-  // reduce 3 registers
-  // No early return: even idle blocks fall through to write (scaled) zeros.
-  const bool doWork = mStartY < mEndY && mStartX < mEndX;
-
-  // float* shHidActLoad = &shHidActs[loadY][loadX];
-  // float* shImgLoad = &shImages[loadY][loadX];
-
-  // Double-buffer registers holding the next tile while the current one is
-  // consumed from shared memory.
-  float imPreload[preloadCases * colorsPerThread / B_X]; // [4]
-  float haPreload[preloadCases * filtersPerThread / B_Y]; // [8]
-
-  // Per-thread partial sums of the weight gradient.
-  float prod[filtersPerThread][colorsPerThread];
-
-#pragma unroll
-  for (int f = 0; f < filtersPerThread; f++) {
-#pragma unroll
-    for (int c = 0; c < colorsPerThread; c++) {
-      prod[f][c] = 0;
-    }
-  }
-  // int pixIdx, pixIdxNext, m, mNext;
-
-  // conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords(
-  //     mStartY, mStartX, paddingStart, numModulesX, moduleStride,
-  //     blockPixelY, blockPixelX, imgSizeX, imgStride,
-  //     pixIdx, m);
-
-  // Inlined replacement for the setCoords helper used by the other variants.
-  const int pixIdx =
-      pIdxBase + (mStartY * imgSizeX + mStartX) * moduleStride * imgStride;
-  const int m = (mStartY * numModulesX + mStartX);
-
-  // preload the image's pixel; only the first B_Y*colorsPerThread/4 load rows
-  // participate (16-thread stride between the 4 fetches)
-  if (doWork && (ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) {
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-      imPreload[i] = tex1Dfetch<float>(
-          images, imgOffset + 16 * i * imgPixels * imgStride + pixIdx);
-    }
-  }
-
-  // preload the hidAct's pixel; analogous guard for the hidAct load rows
-  if (doWork && (ty * B_X + tx) / preloadCases < (B_X * filtersPerThread) / 8) {
-#pragma unroll
-    for (int i = 0; i < 8; i++) {
-      haPreload[i] = tex1Dfetch<float>(
-          hidActs,
-          hidActsOffset + 16 * i * numImages * numModules + m * numImages);
-    }
-  }
-
-  // Main loop: over modules in the chunk, then over images in chunks of
-  // preloadCases, software-pipelining texture loads against the FMA loop.
-  for (int my = mStartY; my < mEndY; my++) {
-    for (int mx = mStartX; mx < mEndX; mx++) {
-      for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
-        int imgOffset2 = imgOffset + caseIdx + preloadCases + pIdxBase +
-            (my * imgSizeX + mx) * moduleStride * imgStride;
-        int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases +
-            (my * numModulesX + mx) * numImages;
-
-        // Last case chunk of this module: preload from the next module
-        // instead of reading past numImages.
-        if (caseIdx + preloadCases == numImages) {
-          const int mxNext = mx + 1 == mEndX ? mStartX : mx + 1;
-          const int myNext = my + (mx + 1 == mEndX);
-
-          // NOTE(review): '+ +pIdxBase' is a harmless unary plus, not a typo
-          // that changes the value.
-          imgOffset2 = imgOffset + +pIdxBase +
-              (myNext * imgSizeX + mxNext) * moduleStride * imgStride;
-          hidActsOffset2 =
-              hidActsOffset + (myNext * numModulesX + mxNext) * numImages;
-        }
-
-        if ((ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) {
-          // store the previously preloaded pixel into shared memory
-          shImages[(ty * B_X + tx) % preloadCases]
-                  [(ty * B_X + tx) / preloadCases]
-                      .x = imPreload[0];
-          shImages[(ty * B_X + tx) % preloadCases]
-                  [(ty * B_X + tx) / preloadCases]
-                      .y = imPreload[2];
-          shImages[(ty * B_X + tx) % preloadCases]
-                  [(ty * B_X + tx) / preloadCases + 16]
-                      .x = imPreload[1];
-          shImages[(ty * B_X + tx) % preloadCases]
-                  [(ty * B_X + tx) / preloadCases + 16]
-                      .y = imPreload[3];
-        }
-
-        if ((ty * B_X + tx) / preloadCases < (B_X * filtersPerThread / 8)) {
-          shHidActs[(ty * B_X + tx) % preloadCases]
-                   [(ty * B_X + tx) / preloadCases]
-                       .x = haPreload[0];
-          shHidActs[(ty * B_X + tx) % preloadCases]
-                   [(ty * B_X + tx) / preloadCases]
-                       .y = haPreload[2];
-          shHidActs[(ty * B_X + tx) % preloadCases]
-                   [(ty * B_X + tx) / preloadCases + 32]
-                       .x = haPreload[4];
-          shHidActs[(ty * B_X + tx) % preloadCases]
-                   [(ty * B_X + tx) / preloadCases + 32]
-                       .y = haPreload[6];
-          shHidActs[(ty * B_X + tx) % preloadCases]
-                   [(ty * B_X + tx) / preloadCases + 16]
-                       .x = haPreload[1];
-          shHidActs[(ty * B_X + tx) % preloadCases]
-                   [(ty * B_X + tx) / preloadCases + 16]
-                       .y = haPreload[3];
-          shHidActs[(ty * B_X + tx) % preloadCases]
-                   [(ty * B_X + tx) / preloadCases + 48]
-                       .x = haPreload[5];
-          shHidActs[(ty * B_X + tx) % preloadCases]
-                   [(ty * B_X + tx) / preloadCases + 48]
-                       .y = haPreload[7];
-        }
-
-        // Kick off the next iteration's texture loads before consuming the
-        // shared tiles (overlaps latency with the FMA loop below).
-#pragma unroll
-        for (int r = 0; r < 8; r++) {
-          haPreload[r] = tex1Dfetch<float>(
-              hidActs, hidActsOffset2 + r * 16 * numImages * numModules);
-        }
-
-#pragma unroll
-        for (int r = 0; r < 4; r++) {
-          imPreload[r] = tex1Dfetch<float>(
-              images, imgOffset2 + r * 16 * imgPixels * imgStride);
-        }
-        __syncthreads();
-
-// put together the instructions of same type to improve instruction-level
-// parallelism; calculate the derivative of the hidAct with respect to weight
-#pragma unroll
-        for (int r = 0; r < 16; r++) {
-#pragma unroll
-          for (int c = 0; c < 4; c++) {
-            prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].x;
-            prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].y;
-            prod[2][c] +=
-                shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].x;
-            prod[3][c] +=
-                shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].y;
-            prod[0][c + 4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].x;
-            prod[1][c + 4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].y;
-            prod[2][c + 4] +=
-                shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].x;
-            prod[3][c + 4] +=
-                shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].y;
-          }
-        }
-
-        __syncthreads();
-      }
-    }
-  }
-
-  // Write back the accumulated gradient tile, optionally blending with the
-  // existing target values ('scale' is a compile-time branch).
-  if (scale) {
-#pragma unroll
-    for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
-      for (int f = 0; f < filtersPerThread; f++) {
-        targets[c * B_Y * filterPixelsAll + f * B_X] =
-            scaleTargets * targets[c * B_Y * filterPixelsAll + f * B_X] +
-            scaleOutputs * prod[f][c];
-      }
-    }
-  } else {
-#pragma unroll
-    for (int c = 0; c < colorsPerThread; c++) {
-#pragma unroll
-      for (int f = 0; f < filtersPerThread; f++) {
-        targets[c * B_Y * filterPixelsAll + f * B_X] =
-            scaleOutputs * prod[f][c];
-      }
-    }
-  }
-}
-
-/*
- * Computes the (rows, cols) shape of the weight-gradient output tensor:
- * rows = numModuleChunks * numFilterColors * filterPixels, cols = numFilters,
- * where the module grid is partitioned into DIVUP(numModules{X,Y}, sumWidth)
- * chunks that are each summed separately (cf. the kernels above).
- */
-std::pair<int, int> getWeightActsOutputSize(
-    int numModulesY,
-    int numModulesX,
-    int numFilterColors,
-    int filterSize,
-    int numFilters,
-    int sumWidth) {
-  const int outputModuleChunksX = DIVUP(numModulesX, sumWidth);
-  const int outputModuleChunksY = DIVUP(numModulesY, sumWidth);
-  const int outputModuleChunks = outputModuleChunksX * outputModuleChunksY;
-  return std::pair<int, int>(
-      outputModuleChunks * numFilterColors * filterSize * filterSize,
-      numFilters);
-}
-
-/*
- * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
- * hidActs: (numFilters, numModules, numImages)
- *
- * targets: (numModuleY*numModulesX/partialSum, numFilterColors,
- * filterPixels, numFilters)
- *
- * TODO: you can get a slight speed boost for local non-convolutional units by
- * writing special routines for partialSum = 1. But I dunno if the code
- * duplication is worth it...
- *
- * Note: all of these convolution routines are optimized for the case when
- * the number of images (i.e. the minibatch size) is a multiple of 128.
- *          Other batch sizes will work, but I made no attempt whatsoever
- * to make them work fast.
- */
-void _weightActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int filterSize,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- int sumWidth,
- float scaleTargets,
- float scaleOutput) {
- CAFFE_ENFORCE(images->ndim() == 2);
- CAFFE_ENFORCE(hidActs->ndim() == 2);
- CAFFE_ENFORCE(targets->ndim() == 2);
-
- int numFilterColors = numImgColors / numGroups;
- int imgStride = images->dim32(1);
- int numImages = images->dim32(1);
- int imgPixels = images->dim32(0) / numImgColors;
- int imgSizeX = imgPixels / imgSizeY;
- int numModules = numModulesY * numModulesX;
- int numFilters = hidActs->dim32(0) / numModules;
- int numFiltersPerGroup = numFilters / numGroups;
-
- CAFFE_ENFORCE(numImgColors % numGroups == 0);
- CAFFE_ENFORCE(numFilters % (16 * numGroups) == 0);
- CAFFE_ENFORCE(
- numGroups > 1 ||
- (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 16 == 0)));
- CAFFE_ENFORCE(numGroups == 1 || numFilterColors % 16 == 0);
- CAFFE_ENFORCE(imgSizeY * imgSizeX == imgPixels);
- CAFFE_ENFORCE(images->dim32(0) == imgPixels * numImgColors);
-
- int filterPixels = filterSize * filterSize;
- int outputModuleChunksX = DIVUP(numModulesX, sumWidth);
- int outputModuleChunksY = DIVUP(numModulesY, sumWidth);
- int outputModuleChunks = outputModuleChunksX * outputModuleChunksY;
- // partialSum = partialSum == 0 ? numModules : partialSum;
-
- // CAFFE_ENFORCE(numModules % partialSum == 0);
- CAFFE_ENFORCE(hidActs->dim32(1) == numImages);
-
- // These routines don't handle the case when only part of the image is visited
- // in the convolution
- CAFFE_ENFORCE(paddingStart <= 0);
- CAFFE_ENFORCE(
- paddingStart + (numModulesX - 1) * moduleStride + filterSize >= imgSizeX);
- CAFFE_ENFORCE(
- paddingStart + (numModulesY - 1) * moduleStride + filterSize >= imgSizeY);
- CAFFE_ENFORCE(moduleStride <= filterSize);
-
- CAFFE_ENFORCE(numModules * numFilters == hidActs->dim32(0));
-
- int preloadCases = 32;
-
- dim3 blocks, threads;
- int bx, by;
- int pixelsPerThread, filtersPerThread, colorsPerThread;
- // Worth playing with these parameters to find best values for your problem.
- // These values work relatively well, but not optimal for all problems.
- if (numFilterColors > 3) {
- filtersPerThread =
- numFiltersPerGroup % 64 == 0 ? 4 : numFiltersPerGroup % 32 == 0 ? 2 : 1;
- colorsPerThread = numFilterColors % 64 == 0
- ? 8
- : numFilterColors % 48 == 0 ? 6 : numFilterColors % 32 == 0 ? 8 : 4;
- by = (numFilterColors / colorsPerThread) % 8 == 0 ? 8 : 4;
- bx = numFiltersPerGroup % 128 == 0 ? 32 : 16;
- preloadCases = filtersPerThread * colorsPerThread < 32 ? 32 : 16;
- blocks = dim3(
- outputModuleChunks * (numFilters / (bx * filtersPerThread)),
- numFilterColors / (by * colorsPerThread),
- filterPixels);
- CAFFE_ENFORCE(numFilterColors % (by * colorsPerThread) == 0);
- } else { // This is ugly but it's nice to spell it out clearly
- CAFFE_ENFORCE(numGroups == 1); // Just for sanity
- // NOTE: these things are only optimized for colors = 3. I didn't really
- // test other cases.
- if (numFilters % 64 ==
- 0) { // TODO: having a separate case for 128 would make things faster,
- // but I probably don't care about 128
- filtersPerThread = 4;
- pixelsPerThread = 2;
- by = 16;
- bx = 16;
- preloadCases = 32;
- } else if (numFilters % 48 == 0) {
- filtersPerThread = 3;
- pixelsPerThread = 4;
- by = 16;
- bx = 16;
- preloadCases = 32;
- } else if (numFilters % 32 == 0) {
- filtersPerThread = 2;
- pixelsPerThread = 2;
- by = 8;
- bx = 16;
- preloadCases = 16;
- } else { // This case is completely untested. It might be really slow. But
- // no time now.
- filtersPerThread = 1;
- pixelsPerThread = 16;
- by = 16;
- bx = 16;
- preloadCases = 32;
- }
- blocks = dim3(
- outputModuleChunks * (numFilters / (bx * filtersPerThread)),
- DIVUP(filterPixels, by * pixelsPerThread));
- }
- CAFFE_ENFORCE((by * bx) % preloadCases == 0);
- CAFFE_ENFORCE(numFilters % (bx * filtersPerThread) == 0);
- threads = dim3(bx, by);
- bool checkCaseBounds = numImages % preloadCases != 0;
- bool scale = scaleTargets != 0;
- std::pair<int, int> targetSize = getWeightActsOutputSize(
- numModulesY,
- numModulesX,
- numFilterColors,
- filterSize,
- numFilters,
- sumWidth);
- if (!scale) {
- targets->Resize(std::vector<int>{targetSize.first, targetSize.second});
- } else {
- CAFFE_ENFORCE(targets->dim32(0) == targetSize.first);
- CAFFE_ENFORCE(targets->dim32(1) == targetSize.second);
- }
-
- float* images_data = images->mutable_data<float>();
- float* hidacts_data = hidActs->mutable_data<float>();
- float* targets_data = targets->mutable_data<float>();
- const std::size_t images_bytes = images->nbytes();
-
- cudaStream_t stream = context->cuda_stream();
-
- checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
-
- if (scale == false) {
- if (checkCaseBounds == false) {
- if (numFilterColors > 3) {
- if (numFilterColors % 64 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs);
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16<
- 8,
- 32,
- 4,
- 8,
- 16,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16<
- 8,
- 32,
- 4,
- 8,
- 16,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_hidacts,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs);
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16<
- 8,
- 16,
- 4,
- 8,
- 16,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16<
- 8,
- 16,
- 4,
- 8,
- 16,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_hidacts,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs);
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32<
- 8,
- 32,
- 4,
- 6,
- 32,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32<
- 8,
- 32,
- 4,
- 6,
- 32,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_hidacts,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs);
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 3,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 3,
- false,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_hidacts,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs);
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 3,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 3,
- false,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_hidacts,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 8,
- 16,
- 2,
- 2,
- 2,
- 16,
- 3,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 3,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors == 2) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 2,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 2,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 8,
- 16,
- 2,
- 2,
- 2,
- 16,
- 2,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 2,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors == 1) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 1,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 1,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 8,
- 16,
- 2,
- 2,
- 2,
- 16,
- 1,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 1,
- false,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, false, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (checkCaseBounds == true) {
- if (numFilterColors > 3) {
- if (numFilterColors % 64 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 3,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 3, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 3,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 3, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 8,
- 16,
- 2,
- 2,
- 2,
- 16,
- 3,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 3,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors == 2) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 2,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 2,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 8,
- 16,
- 2,
- 2,
- 2,
- 16,
- 2,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 2,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors == 1) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 1,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 1,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 8,
- 16,
- 2,
- 2,
- 2,
- 16,
- 1,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 1,
- false,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, false, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- }
- } else if (scale == true) {
- if (checkCaseBounds == false) {
- if (numFilterColors > 3) {
- if (numFilterColors % 64 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs);
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16<
- 8,
- 32,
- 4,
- 8,
- 16,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16<
- 8,
- 32,
- 4,
- 8,
- 16,
- true><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_hidacts,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs);
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16<
- 8,
- 16,
- 4,
- 8,
- 16,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16<
- 8,
- 16,
- 4,
- 8,
- 16,
- true><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_hidacts,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs);
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32<
- 8,
- 32,
- 4,
- 6,
- 32,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32<
- 8,
- 32,
- 4,
- 6,
- 32,
- true><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_hidacts,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs);
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 3,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_preload_pc_2_pt_2_f_4_r_32_c_3<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 3,
- true,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_hidacts,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaTextureObject_t tex_images = GetTensorTextureObject(images);
- cudaTextureObject_t tex_hidacts = GetTensorTextureObject(hidActs);
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 3,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 3,
- true,
- false><<<blocks, threads, 0, stream>>>(
- tex_images,
- tex_hidacts,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- checkCudaErrors(cudaDestroyTextureObject(tex_images));
- checkCudaErrors(cudaDestroyTextureObject(tex_hidacts));
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 8,
- 16,
- 2,
- 2,
- 2,
- 16,
- 3,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 3,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors == 2) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 2,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 2,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 8,
- 16,
- 2,
- 2,
- 2,
- 16,
- 2,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 2,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors == 1) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 1,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 1,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 8,
- 16,
- 2,
- 2,
- 2,
- 16,
- 1,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 1,
- true,
- false>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, true, false>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- } else if (checkCaseBounds == true) {
- if (numFilterColors > 3) {
- if (numFilterColors % 64 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 8, 16, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 8, 16, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 8, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 8, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 48 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 32, 4, 6, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 4, 6, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 2, 6, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<8, 16, 1, 6, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 32 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 8, 16, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 8, 16, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 8, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 8, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors % 16 == 0) {
- if (numFiltersPerGroup % 128 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 32, 4, 4, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 4, 4, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 2, 4, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_mc_mf_kepler_sw<4, 16, 1, 4, 32, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- numImgColors,
- numGroups,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- }
- } else if (numFilterColors <= 3) {
- if (numFilterColors == 3) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 3,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 3, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 3,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 3, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, true, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 3, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 3,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 3, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors == 2) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 2,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 2, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 2,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 2, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, true, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 2, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 2,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 2, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- } else if (numFilterColors == 1) {
- if (numFiltersPerGroup % 64 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 2,
- 4,
- 32,
- 1,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 2, 4, 32, 1, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 48 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 4,
- 3,
- 32,
- 1,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 4, 3, 32, 1, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 32 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, true, true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<8, 16, 2, 2, 2, 16, 1, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- } else if (numFiltersPerGroup % 16 == 0) {
- cudaFuncSetCacheConfig(
- conv_weight_acts_c_kepler_sw<
- 16,
- 16,
- 2,
- 16,
- 1,
- 32,
- 1,
- true,
- true>,
- cudaFuncCachePreferShared);
- conv_weight_acts_c_kepler_sw<16, 16, 2, 16, 1, 32, 1, true, true>
- <<<blocks, threads, 0, stream>>>(
- images_data,
- hidacts_data,
- targets_data,
- numImages,
- numFilters,
- numModulesY,
- numModulesX,
- imgSizeY,
- imgSizeX,
- filterSize,
- paddingStart,
- moduleStride,
- imgStride,
- sumWidth,
- scaleTargets,
- scaleOutput);
- }
- }
- }
- }
- }
- checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
- getLastCudaError("weightActs: kernel execution failed");
-}
-
-void convWeightActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int filterSize,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- int partialSum) {
- _weightActs(
- context,
- images,
- hidActs,
- targets,
- imgSizeY,
- numModulesY,
- numModulesX,
- filterSize,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- partialSum,
- 0,
- 1);
-}
-
-void convWeightActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int filterSize,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- int partialSum,
- float scaleTargets,
- float scaleOutput) {
- _weightActs(
- context,
- images,
- hidActs,
- targets,
- imgSizeY,
- numModulesY,
- numModulesX,
- filterSize,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- partialSum,
- scaleTargets,
- scaleOutput);
-}
-
-void localWeightActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int filterSize,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups) {
- _weightActs(
- context,
- images,
- hidActs,
- targets,
- imgSizeY,
- numModulesY,
- numModulesX,
- filterSize,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- 1,
- 0,
- 1);
-}
-
-void localWeightActs(
- caffe2::CUDAContext* context,
- caffe2::TensorCUDA* images,
- caffe2::TensorCUDA* hidActs,
- caffe2::TensorCUDA* targets,
- int imgSizeY,
- int numModulesY,
- int numModulesX,
- int filterSize,
- int paddingStart,
- int moduleStride,
- int numImgColors,
- int numGroups,
- float scaleTargets,
- float scaleOutput) {
- _weightActs(
- context,
- images,
- hidActs,
- targets,
- imgSizeY,
- numModulesY,
- numModulesX,
- filterSize,
- paddingStart,
- moduleStride,
- numImgColors,
- numGroups,
- 1,
- scaleTargets,
- scaleOutput);
-}
+++ /dev/null
-################################################################################
-#
-# Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
-#
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users. This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
-#
-################################################################################
-
-# Location of the CUDA Toolkit binaries and libraries
-CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include
-CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin
-CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64
-
-# Common binaries
-NVCC = $(CUDA_BIN_PATH)/nvcc
-GCC = g++
-AR = ar
-
-# CUDA code generation flags
-GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
-GENCODE_FLAGS := $(GENCODE_SM35)
-
-LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
-CCFLAGS := -m64
-NVCCFLAGS := -m64
-
-# Debug build flags
-ifeq ($(dbg),1)
- CCFLAGS += -g
- NVCCFLAGS += -g -G
- DBG := debug
-else
- DBG := release
- NVCCFLAGS += -O3
- CCFLAGS += -O3
-endif
-
-# Add profiler output
-ifeq ($(prof),1)
- NVCCFLAGS += --ptxas-options=-v
-endif
-
-TARGETDIR := ./bin/$(DBG)
-OBJDIR := ./obj/$(DBG)
-
-########## USER STUFF ###########
-PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2)
-MODELNAME := _ConvNet
-LDFLAGS += -lpthread -ljpeg -lpython$(PYTHON_VERSION) -L../util -lutilpy -L../nvmatrix -lnvmatrix -L../cudaconv3 -lcudaconv -lcublas -Wl,-rpath=./util -Wl,-rpath=./nvmatrix -Wl,-rpath=./cudaconv3
-INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH)
-
-DEFINES := -DNUMPY_INTERFACE
-
-CUFILES := $(shell find . -name "*.cu")
-CU_DEPS := $(shell find . -name "*.cuh")
-CCFILES := $(shell find . -name "*.cpp")
-C_DEPS := $(shell find . -name "*.h")
-
-NVCCFLAGS += --compiler-options '-fPIC'
-LDFLAGS += -shared
-CCFLAGS += -fPIC
-TARGET := $(TARGETDIR)/$(MODELNAME).so
-
-################################################################################
-# Set up target and object files
-################################################################################
-OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
-OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
-OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))
-
-# Target rules
-all: makedirs $(TARGET)
-
-$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
- $(NVCC) $(DEFINES) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<
-
-$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
- $(GCC) $(DEFINES) $(CCFLAGS) $(INCLUDES) -o $@ -c $<
-
-$(TARGET): $(OBJS)
- $(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS) $(EXTRA_LDFLAGS)
- ln -sf $(TARGET) .
-
-makedirs:
- mkdir -p $(TARGETDIR)
- mkdir -p $(OBJDIR)/src
-
-clean:
- rm -rf ./obj
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ACTBROADCASTER_CUH_H_
-#define ACTBROADCASTER_CUH_H_
-
-#include <map>
-#include "streambroadcast.cuh"
-#include "copypipeline.cuh"
-
-class BroadcastMessage {
-public:
- enum MESSAGE_TYPE {
- BROADCAST,
- EXIT
- };
-protected:
- int _srcDevice;
- std::map<int, NVMatrix*> _mats;
- int _userIdx;
- Queue<int>* _finishQueue;
- MESSAGE_TYPE _type;
- BroadcastMessage(MESSAGE_TYPE type);
-public:
- BroadcastMessage(std::map<int, NVMatrix*> mats, int srcDevice, int userIdx, Queue<int>& finishQueue);
-
- int getSrcDevice();
- std::map<int, NVMatrix*>& getMatrices();
- int getUserIdx();
- Queue<int>& getFinishQueue();
- MESSAGE_TYPE getMessageType();
-};
-
-class ExitBroadcastMessage : public BroadcastMessage {
-public:
- ExitBroadcastMessage();
-};
-
-class ActBroadcaster : public Thread {
-protected:
- std::map<int,IBroadcastNetwork*> _broadcasters; // src device --> broadcaster
- Queue<BroadcastMessage*> _messageQueue;
- int _numUsers;
-public:
- ActBroadcaster(int numUsers, intv& cpus);
- ~ActBroadcaster();
- Queue<BroadcastMessage*>& getMessageQueue();
- virtual void* run();
- void stop();
-};
-
-
-#endif /* ACTBROADCASTER_CUH_H_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef CONVNET3
-#define CONVNET3
-
-#include <vector>
-#include <string>
-#include <set>
-#include <map>
-#include <helper_cuda.h>
-#include <time.h>
-#include "../../util/include/queue.h"
-#include "../../util/include/thread.h"
-#include <math.h>
-#include "../../util/include/sync.h"
-#include "messages.cuh"
-#include "streambroadcast.cuh"
-
-#include "layer.cuh"
-#include "data.cuh"
-#include "worker.cuh"
-#include "weights.cuh"
-#include "pipedispenser.cuh"
-#include "timer.cuh"
-
-class Worker;
-class WorkResult;
-class Layer;
-class DataLayer;
-class CostLayer;
-class ConvNetThread;
-class StreamBroadcast;
-class Weights;
-
-// name -> device id -> layer*
-typedef std::map<std::string,std::map<int, Layer*> > NameReplicaLayerMap;
-typedef std::map<std::string, Layer*> NameLayerMap;
-// name -> ReplicaMap
-//typedef std::map<int,NameLayerMap> ReplicaNameLayerMap;
-typedef std::vector<ConvNetThread*> ConvNetThreadV;
-typedef std::vector<DataLayer*> DataLayerVector;
-//typedef std::map<int,ConvNetThreadV> ReplicaThreadsMap;
-
-class ConvNet : public Thread {
-private:
- void checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights);
-protected:
- NameReplicaLayerMap _layerMap;
- DataLayerVector _dataLayers;
- // Vector of convnet threads (one thread == one GPU)
- ConvNetThreadV _convNetThreads;
-
- DataProvider* _dp;
- CPUData* _data, *_bufferData;
- int _bufferMinibatchIdx, _bufferPassIdx;
- ThreadSynchronizer* _sync;
- intv _deviceIDs;
-
- Queue<Worker*> _workerQueue;
- Queue<WorkResult*> _resultQueue;
- Queue<Message*> _msgQueue;
-
- int _numFwdTerminal;
- std::map<int, int> _numBwdTerminal; // pass idx -> #terminal
- int _totalPassesDone;
- int _numReplicasMin, _numReplicasMax;
- // For gradient checking
- int _numFailures;
- int _numTests;
-
- // Training progress (between 0 and 1).
- // Used to determine learning rate based on ParameterSchedule.
- double _trainingProgress;
- double _baseErr;
- bool _conserveMem;
- PipeDispenser *_dataCopyPD;
-
- void waitForTerminals(int numMsgs, MESSAGES msg);
- void sendMessage(MESSAGES msg, bool sync);
- void sendMessage(Message* msg, bool sync);
- void findBwdTerminal(Layer& l, std::set<Layer*>& visited, int& terminal, int passIdx);
- void connectReplicas();
- void initDataLayers(PyObjectV* layerList);
- void initGPUThreads(PyObjectV* layerList);
- void connectChildren(PyObject* layerParams);
- void* run();
- void setData(CPUData& data, int passIdx);
- void setDataFromBuffer();
- void setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx);
-public:
- ConvNet(PyObject* layerParams, intv& deviceIDs,
- int minibatchSize, bool conserveMem);
- ~ConvNet();
- void stop();
-
- Queue<Message*>& getMessageQueue();
- Queue<Worker*>& getWorkerQueue();
- Queue<WorkResult*>& getResultQueue();
- DataProvider& getDataProvider();
-
- Layer& getLayer(std::string& name, int replicaID);
- void copyToCPU();
- void copyToGPU();
- void updateWeights(int passIdx);
- void reset(int passIdx);
- void reset();
-
- void bprop(int passIdx, PASS_TYPE passType);
- void fprop(int miniIdx, int passIdx, PASS_TYPE passType);
- void fprop(CPUData& data, int passIdx, PASS_TYPE passType);
-
- void setTrainingProgress(double progress);
- double getTrainingProgress() const;
-
- bool checkGradient(const std::string& name, float eps, Weights& weights);
- void checkGradients();
- Cost& getCost();
- Cost& getCost(Cost& cost);
- CPUData& getData(); // Returns last minibatch fpropped
- double getCostValue();
- intv& getDeviceIDs();
- ThreadSynchronizer& getSync();
- void syncWithChildren();
- int getMinibatchSize();
- bool isConserveMemory();
- int getNumReplicasMax();
- int getNumReplicasMin();
- int getNumPasses();
- int getTotalPassesDone();
- PipeDispenser& getDataCopyPD();
-};
-
-class ConvNetThread : public Thread {
-protected:
- NameLayerMap _nameLayerMap;
- std::vector<CostLayer*> _costs;
- ConvNet* _convNet;
- int _deviceID;
- Queue<Message*> _msgQueue;
- Timer _timer;
-// StreamBroadcast* _weightSynchronizer;
-
- void initCuda();
- virtual void initLayer(PyObject* paramsDict, int replicaID);
- void* run();
-public:
- ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet);
- ~ConvNetThread();
-
- NameLayerMap& getLayerMap();
- int getDeviceID();
-
- ConvNet& getConvNet();
-
- Queue<Message*>& getMessageQueue();
- std::vector<CostLayer*>& getCostLayers();
-// StreamBroadcast& getWeightSynchronizer();
-
- Cost& getCost();
- Layer& getLayer(std::string& name);
- void startTimer();
- double stopTimer();
-};
-
-#endif /* CONVNET */
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef COPYPIPELINE_CUH_
-#define COPYPIPELINE_CUH_
-
-#include <set>
-#include "../../util/include/thread.h"
-#include "../../util/include/queue.h"
-#include <helper_cuda.h>
-#include "../../nvmatrix/include/nvmatrix.cuh"
-#include "util.cuh"
-
-#define COPY_MIN_CHUNK_SIZE (1<<18) // 256k
-#define COPY_MAX_CHUNKS 16
-#define COPY_MIN_CHUNKS 2
-
-class CopyPeer;
-class CopySource;
-class ICopySegment;
-class IBroadcastNetwork;
-
-class CopyMessage {
-protected:
- std::map<int,NVMatrix*>* _mats;
- float _scaleSource, _scaleTargets;
-public:
- enum COPY_MESSAGE_TYPE {
- COPY_CHUNK,
- COPY_START,
- EXIT
- };
- CopyMessage(COPY_MESSAGE_TYPE msgType, float scaleSource, float scaleTargets, std::map<int, NVMatrix*>& mats)
- : _msgType(msgType), _scaleSource(scaleSource), _scaleTargets(scaleTargets), _mats(&mats) {
- }
- CopyMessage(COPY_MESSAGE_TYPE msgType)
- : _msgType(msgType), _scaleSource(0), _scaleTargets(0), _mats(NULL) {
- }
- inline COPY_MESSAGE_TYPE getType() const {
- return _msgType;
- }
- inline NVMatrix& getMatrix(int deviceID) const {
- return *_mats->at(deviceID);
- }
- inline std::map<int,NVMatrix*>& getMatrices() const {
- return *_mats;
- }
- inline float getScaleSource() const {
- return _scaleSource;
- }
- inline float getScaleTargets() const {
- return _scaleTargets;
- }
-protected:
- COPY_MESSAGE_TYPE _msgType;
-};
-
-class CopyChunkMessage : public CopyMessage {
-protected:
- int _chunkIdx;
- int _chunkSize;
- int _numChunks;
-public:
- CopyChunkMessage(int chunkIdx, int chunkSize, int numChunks, float scaleSource, float scaleTargets, std::map<int, NVMatrix*>& mats)
- : _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks), CopyMessage(COPY_CHUNK, scaleSource, scaleTargets, mats) {
- }
-
- inline int getChunkIdx() const {
- return _chunkIdx;
- }
- inline int getChunkSize() const {
- return _chunkSize;
- }
- inline int getNumChunks() const {
- return _numChunks;
- }
-};
-
-class CopyStartMessage : public CopyMessage {
-public:
- CopyStartMessage(float scaleSource, float scaleTargets, std::map<int,NVMatrix*>& mats) : CopyMessage(COPY_START, scaleSource, scaleTargets, mats) {
- }
-};
-
-class ICopySegment : public Thread {
-protected:
- int _deviceID, _execDeviceID;
- cudaStream_t _stream;
- ICopySegment* _prev;
- std::vector<CopyPeer*> _next;
- Queue<CopyMessage*> _queue;
- Queue<int>* _finishQueue;
- HostNVMatrix _hmat;
- IBroadcastNetwork* _parent;
-
- NVMatrix& getChunk(NVMatrix& mat, int chunkSize, int chunkIdx);
- void* run();
- virtual bool processMessage(CopyMessage& msg) = 0;
-
-public:
- ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue);
- virtual ~ICopySegment();
- inline NVMatrix& getMatrix(CopyMessage& msg);
- Queue<CopyMessage*>& getQueue();
- inline int getDeviceID();
- void addPrev(ICopySegment& c);
- void addNext(CopyPeer& c);
- bool isTerminal() const;
- virtual bool isSource() const = 0;
-};
-
-class CopySource : public ICopySegment {
-protected:
- bool processMessage(CopyMessage& msg);
-public:
- CopySource(IBroadcastNetwork& parent, int deviceID);
- inline bool isSource() const;
-};
-
-class CopyPeer : public ICopySegment {
-protected:
- bool processMessage(CopyMessage& msg);
-public:
- CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue);
- inline bool isSource() const;
-};
-
-class IBroadcastNetwork {
-protected:
- Queue<int> _finishQueue;
- CopySource* _src;
- std::vector<CopyPeer*> _peers;
- int _srcDeviceID, _numTerminal;
- bool _constructed;
- std::set<int> _devices;
- std::pair<std::vector<int>,std::vector<int> > makeGPULists();
-
- void makePeers(std::pair<std::vector<int>,std::vector<int> >& gpus);
- virtual void makeConnections() = 0;
- virtual void _broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
- IBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal);
-public:
- virtual IBroadcastNetwork& construct();
- virtual ~IBroadcastNetwork();
-
- virtual void broadcast(std::map<int, NVMatrix*>& mats);
- int getSourceDeviceID() const;
- static IBroadcastNetwork& make(std::set<int> devices, int srcDeviceID);
-};
-
-class ISafeBroadcastNetwork : public IBroadcastNetwork {
-protected:
- ISafeBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal);
-public:
- virtual void broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
- virtual ISafeBroadcastNetwork& construct();
- static ISafeBroadcastNetwork& make(std::set<int> devices, int srcDeviceID);
-};
-
-class NullBroadcaster : public ISafeBroadcastNetwork {
-protected:
- NullBroadcaster(std::set<int>& devices, int srcDeviceID);
- void makeConnections();
-public:
- NullBroadcaster& construct();
- void broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
- void broadcast(std::map<int, NVMatrix*>& mats);
- friend class IBroadcastNetwork;
- friend class ISafeBroadcastNetwork;
-};
-
-/*
- * This one goes to host and then to targets.
- */
-class NaiveBroadcaster : public ISafeBroadcastNetwork {
-protected:
- NaiveBroadcaster(std::set<int>& devices, int srcDeviceID);
- void makeConnections();
- friend class IBroadcastNetwork;
- friend class ISafeBroadcastNetwork;
-};
-
-class EightGPUBroadcaster1 : public IBroadcastNetwork {
-protected:
- EightGPUBroadcaster1(std::set<int>& devices, int srcDeviceID);
- void makeConnections();
- friend class IBroadcastNetwork;
-};
-
-class TwoPeeringGPUsBroadcaster : public ISafeBroadcastNetwork {
-protected:
- int _tgtDeviceID;
- cudaStream_t _tgtStream;
- void makeConnections();
- void resetDeviceID(int d);
- void _broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets);
-public:
- TwoPeeringGPUsBroadcaster(std::set<int>& devices, int srcDeviceID);
- ~TwoPeeringGPUsBroadcaster();
- ISafeBroadcastNetwork& construct();
- friend class IBroadcastNetwork;
- friend class ISafeBroadcastNetwork;
-};
-
-#endif /* COPYPIPELINE_CUH_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef COST_CUH
-#define COST_CUH
-
-#include <vector>
-#include <map>
-#include <helper_cuda.h>
-
-#include "layer.cuh"
-#include "util.cuh"
-
-class CostLayer;
-
-/*
- * Wrapper for dictionary mapping cost name to vector of returned values.
- */
-class Cost {
-protected:
- std::map<std::string,int> _numCases;
- CostMap _costMap;
- CostCoeffMap _costCoeffMap;
- std::map<std::string,int>& getNumCasesMap();
-public:
- Cost();
- Cost(std::vector<CostLayer*>& costs);
- doublev& operator [](const std::string s);
- CostMap& getCostMap();
- CostCoeffMap& getCostCoeffMap();
- int getNumCases();
- /*
- * Returns sum of first values returned by all the CostLayers, weighted by the cost coefficients.
- */
- double getValue();
- Cost& operator += (Cost& er);
- virtual ~Cost();
- void print();
-};
-
-
-#endif /* COST_CUH */
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef DATA_CUH
-#define DATA_CUH
-
-#include <vector>
-#include <algorithm>
-#include "util.cuh"
-
-class CPUData {
-protected:
- MatrixV* _data;
- void assertDimensions() {
- assert(_data->size() > 0);
- for (int i = 1; i < _data->size(); i++) {
- assert(_data->at(i-1)->getNumCols() == _data->at(i)->getNumCols());
- if (_data->at(i-1)->isTrans() != _data->at(i)->isTrans() && _data->at(i)->getNumElements() < 2) {
- _data->at(i)->setTrans(_data->at(i-1)->isTrans());
- }
- assert(_data->at(i-1)->isTrans() == _data->at(i)->isTrans());
- }
- assert(_data->at(0)->getNumCols() > 0);
- }
-public:
- typedef typename MatrixV::iterator T_iter;
- // Cases in columns, but array may be transposed
- // (so in memory they can really be in rows -- in which case the array is transposed
- // during the copy to GPU).
- CPUData(PyObject* pyData) {
- _data = getMatrixV(pyData);
- assertDimensions();
- }
-
- CPUData(MatrixV* data) : _data(data) {
- assertDimensions();
- }
-
- ~CPUData() {
- for (T_iter it = _data->begin(); it != _data->end(); ++it) {
- delete *it;
- }
- delete _data;
- }
-
- Matrix& operator [](int idx) const {
- return *_data->at(idx);
- }
-
- int getSize() const {
- return _data->size();
- }
-
- MatrixV& getData() const {
- return *_data;
- }
-
- Matrix& getData(int i) const {
- return *_data->at(i);
- }
-
- bool isTrans() const {
- return _data->at(0)->isTrans();
- }
-
- int getNumCases() const {
- return _data->at(0)->getNumCols();
- }
-};
-
-class DataProvider {
-protected:
- CPUData* _hData;
- NVMatrixV _data;
- int _minibatchSize;
-public:
- DataProvider(int minibatchSize);
- void setData(CPUData&);
- void clearData();
- CPUData& getMinibatch(int idx);
- CPUData& getDataSlice(int startCase, int endCase);
- int getNumMinibatches();
- int getMinibatchSize();
- int getNumCases();
-};
-
-#endif /* DATA_CUH */
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef GRADREDUCER_CUH_
-#define GRADREDUCER_CUH_
-
-#include <set>
-#include <algorithm>
-#include "streambroadcast.cuh"
-#include "reducepipeline.cuh"
-#include "layer.cuh"
-#include "util.cuh"
-
-class StreamBroadcast;
-class Layer;
-
-#define ACT_GRAD_REDUCER_EXIT (1 << 16)
-
-//class ReduceMessage {
-// ReduceMessage();
-// ReduceMessage(bool exit);
-//};
-
-class IActGradReducer : public Thread {
-protected:
- Layer* _parent;
- Queue<int> _finishQueue;
- int _numExpectedMsgsTotal;
- std::map<int,int> _numExpectedMsgs; // map from device id -> num expected msgs
-
- void* run();
- virtual bool reduce() = 0;
- virtual void reset() = 0;
-public:
- IActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
- virtual ~IActGradReducer();
- int waitForFinish();
- virtual void enqueueReduction(int deviceID) = 0;
- virtual void stop() = 0;
- static IActGradReducer& makeGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
-};
-
-class SequentialActGradReducer : public IActGradReducer {
-protected:
-
- std::map<int,int> _numReceivedMsgs; // map from device id -> num received msgs
-
- std::map<int,Queue<int>* > _messageQueues;
- intv _deviceIDs;
- StreamBroadcast* _broadcaster;
- bool reduce();
- void reset();
-public:
- SequentialActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
- ~SequentialActGradReducer();
- void enqueueReduction(int deviceID);
- void stop();
-};
-
-class ParallelActGradReducer : public IActGradReducer {
-protected:
- IEightGPUReducer* _reducer;
- int _numReceivedMsgs;
- float _scaleTarget;
- Queue<int> _messageQueue;
- bool reduce();
- void reset();
-public:
- ParallelActGradReducer(Layer& parent, std::map<int, int> numExpectedMsgs);
- void enqueueReduction(int deviceID);
- void stop();
-};
-
-
-#endif /* GRADREDUCER_CUH_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef JPEG_MAIN_H
-#define JPEG_MAIN_H
-
-#include <cstdio>
-#include <cstdlib>
-#include <Python.h>
-#include <vector>
-#include <string>
-#include <iostream>
-#include <jpeglib.h>
-//#include <arrayobject.h>
-#include "../../util/include/thread.h"
-#include "../../util/include/matrix.h"
-
-#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y) - 1) / (y))
-#endif
-
-#define NUM_JPEG_DECODER_THREADS 4
-
-
-class DecoderThread : public Thread {
- protected:
- PyObject* _pyList;
- Matrix* _target;
- int64 _start_img, _end_img;
- int64 _img_size, _inner_size, _inner_pixels;
- bool _test, _multiview;
-
- unsigned char* _decodeTarget;
- int64 _decodeTargetSize;
- unsigned int _rseed;
-
- void* run();
- void decodeJpeg(int idx, int& width, int& height);
- double randUniform();
- double randUniform(double min, double max);
- void crop(int64 i, int64 width, int64 height, bool flip);
- virtual void crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y);
- public:
- DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview);
- virtual ~DecoderThread();
-};
-
-#endif // JPEG_MAIN_H
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LAYER_CUH
-#define LAYER_CUH
-
-#include <algorithm>
-#include <string>
-#include <vector>
-#include <map>
-#include <assert.h>
-#include <helper_timer.h>
-#include "../../nvmatrix/include/nvmatrix.cuh"
-//#include "experimental/akrizhevsky/g3/mactruck-gpu-tests/gpu_util.cuh"
-
-#include "weights.cuh"
-#include "convnet.cuh"
-#include "cost.cuh"
-#include "neuron.cuh"
-#include "data.cuh"
-#include "layer_kernels.cuh"
-#include "streambroadcast.cuh"
-#include "actbroadcaster.cuh"
-#include "gradreducer.cuh"
-#include "util.cuh"
-#include "timer.cuh"
-#include "memorysource.cuh"
-
-class Cost;
-class ConvNet;
-class ConvNetThread;
-class CostLayer;
-class DataLayer;
-class Layer;
-class ActBroadcaster;
-class BroadcastMessage;
-class IActGradReducer;
-class Weights;
-class WeightList;
-typedef std::vector<Layer*> LayerV;
-
-class BinomialCrossEntOperator {
-protected:
- float _posWeight;
-public:
- BinomialCrossEntOperator(float posWeight) : _posWeight(posWeight) {
- }
- __device__ inline float operator()(const float t, const float y) const {
- return _posWeight * t * safelog(y) + (1.0f - t) * safelog(1.0f - y);
- }
-};
-
-class CrossEntOperator {
-protected:
- float _posWeight;
-public:
- CrossEntOperator(float posWeight) : _posWeight(posWeight) {
- }
- __device__ inline float operator()(const float t, const float y) const {
- return _posWeight * t * safelog(y);
- }
-};
-
-/*
- * Abstract layer.
- */
-class Layer {
-protected:
- ConvNetThread* _convNetThread;
-
- // This is a vector[#layers_next]
- std::vector<Layer*> _next;
- // This is a vector[#replicas_prev][#layers_prev]
- std::map<int, std::vector<Layer*> > _prev;
-
- int _rcvdFInputMsgs;
- std::map<int, int> _numComputedActsGrads;
- int _rcvdBInputMsgs;
- int _numOutputs;
- std::map<int, NVMatrix*> _inputs; // input idx -> matrix
- std::map<int, MemoryView*> _memSrcActs; // device id -> memory source
- std::map<int, MemoryView*> _memSrcActsGrad; // device id -> memory source
-
- bool _gradConsumer, _foundGradConsumers, _trans;
- std::map<int,bool> _bwdTerminal; // One bool per pass
- int _numGradProducersNext;
- int _actsTarget, _actsGradTarget;
- std::string _name, _type;
- intv _nextDeviceIDs, _prevDeviceIDs;
- HostNVMatrix _hostMemFwd;
-
- // New replica-related stuff:
- std::map<int,Layer*> _replicas; // NOTE: a layer is its own sibling, too
- // Previous layers sorted by device ID, in reverse order in which they are procesed by
- // sequential grad reducer. map from replica -> device id -> layers
- std::map<int,std::map<int,std::set<Layer*> > > _prevByDevice;
- std::map<std::string, int> _inputIndices;
- int _replicaID;
- int _numReplicas;
- int _numReplicasPrev, _numReplicasNext;
-
- Queue<int> _broadcastFinishQueue;
- Queue<int> _reductionFinishQueue;
- ActBroadcaster* _actBroadcaster;
- IActGradReducer* _gradReducer;
- Timer _timer;
- bool _initialized;
-
- virtual void fpropNext(PASS_TYPE passType, int passIdx);
- virtual void truncBwdActs();
- virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) = 0;
-
- virtual void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType) {
- // Do nothing by default
- }
- virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(!isGradProducer()); // Only do nothing if not grad producer
- }
- virtual void fpropCommon(PASS_TYPE passType) {
-
- }
- void bpropActsCall(NVMatrix& v, PASS_TYPE passType, int replicaIdx, int inputIdx);
-
- ActBroadcaster& getActBroadcaster();
- IActGradReducer& getGradReducer();
- int getInputIdx(std::string& parentName);
- void setInputIdx(std::string& parentName, int idx);
-
-public:
- static bool _saveActsGrad, _saveActs;
-
- Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
- virtual ~Layer();
-
- virtual bool fprop(PASS_TYPE passType, int passIdx);
- void fprop(NVMatrix& v, int inpIdx, PASS_TYPE passType, int passIdx);
- virtual void fprop(std::map<int,NVMatrix*>& v, PASS_TYPE passType, int passIdx);
- virtual void bprop(PASS_TYPE passType, int passIdx);
- virtual void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx);
- virtual void reset();
- virtual void resetPassIdx();
- int getNumCases(NVMatrix& v);
- int& getNumComputedActsGrads(int deviceID);
- int incRcvdBInputMsgs();
- bool isGradConsumer();
- bool hasGradProducerNext(std::string& layerName);
- // Does this layer produce a gradient for any layer?
- virtual bool isGradProducer();
- // Does this layer produce a gradient for layer of given name?
- virtual bool isGradProducer(std::string& layerName);
- std::string& getName();
- std::string& getType();
- virtual void addNext(Layer& l);
- virtual void addPrev(Layer& l, int replicaIdx);
- virtual void addReplica(Layer& l);
- std::map<int,std::vector<Layer*> >& getPrev();
- std::vector<Layer*>& getNext();
- virtual NVMatrix& getActs();
- virtual NVMatrix& getActs(int deviceID);
- virtual NVMatrix& getActs(int deviceID, int numCases);
- virtual NVMatrix& getActsGrad();
- virtual NVMatrix& getActsGrad(int deviceID);
- virtual std::map<int,NVMatrix*> getAllActs();
- virtual std::map<int, NVMatrix*> getAllActsGrads();
- virtual bool postInit();
- int getDeviceID();
- ConvNetThread& getConvNetThread();
- cudaStream_t getStream();
- void syncStream();
- void setBwdTerminal(int passIdx);
- // Do nothing if this layer has no weights
- virtual bool updateWeights() {
- return false;
- }
- virtual bool constrainWeights() {
- return false;
- }
- virtual void checkGradient() {
- }
- virtual void copyToCPU() {
- }
- virtual void copyToGPU() {
- }
- intv& getNextDeviceIDs() {
- return _nextDeviceIDs;
- }
-
- int getReplicaID();
- int getNumReplicas();
- int getNumSiblingReplicas();
- int getNumReplicasPrev();
- int getNumReplicasNext();
- int getNumOutputs();
- void setMemorySourceActs(int deviceID, MemoryView& mem);
- void setMemorySourceActsGrad(int deviceID, MemoryView& mem);
- MemoryView& getMemorySourceActs(int deviceID);
- MemoryView& getMemorySourceActsGrad(int deviceID);
- int getFwdActiveInputReplicaIdx(int passIdx);
- int getBwdActiveInputReplicaIdx(int passIdx);
- int getFwdActiveReplicaIdx(int passIdx);
- int getNumLayersPrev();
- virtual int getNumInputReplicas();
- int getNumExpectedBwdMsgs();
- int getNumExpectedFwdMsgs();
- int getReplicaIdx();
- int getActivePassPeriod();
- int getNumGradProducersNext();
- virtual ConvNet& getConvNet();
-};
-
-class TwoDLayerInterface {
-protected:
- int _channels, _imgSize, _imgPixels;
-public:
- TwoDLayerInterface(PyObject* paramsDict);
-};
-
-class NeuronLayer : public Layer {
-protected:
- Neuron* _neuron;
- std::string _neuronType;
-
- virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
- virtual bool bpropSpecial(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- class CrossEntLogisticGradientOperator {
- private:
- float _coeff, _posWeight;
- public:
- CrossEntLogisticGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) {
- }
- __device__ inline float operator()(const float y, const float t) const {
- return _coeff * (_posWeight * t * (1.0f - y) + (t - 1.0f) * y);
- }
- };
- NeuronLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
- ~NeuronLayer();
- std::string& getNeuronType();
-};
-
-class WeightLayer : public Layer {
-protected:
- WeightList* _weights;
- Weights *_biases;
- NVMatrix _norm2;
- float _wStep, _bStep;
- int _weightUpdatePassPeriod;
- void fpropCommon(PASS_TYPE passType);
- void bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType);
- virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType) = 0;
- virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) = 0;
- virtual void _constrainWeights();
- virtual float getGradScale(int inpIdx, PASS_TYPE passType);
- virtual float getIncScale(int inpIdx, PASS_TYPE passType);
- virtual float getBGradScale(PASS_TYPE passType);
- virtual float getBIncScale();
- virtual NVMatrix& getGradTarget(int inpIdx);
- NVMatrix& getWeightMatrix(PASS_TYPE passType, int inpIdx);
- NVMatrix& getBiasMatrix(PASS_TYPE passType);
-public:
- WeightLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans, bool useGrad);
- virtual ~WeightLayer();
- virtual bool updateWeights();
- virtual bool constrainWeights();
- virtual void copyToCPU();
- virtual void copyToGPU();
- virtual void checkGradient();
- Weights& getWeights(int idx);
- void addReplica(Layer& l);
- virtual bool postInit();
-};
-
-class FCLayer : public WeightLayer {
-protected:
- virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
- virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType);
- virtual void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
- virtual void _constrainWeights();
-public:
- FCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
- FCLayer();
-};
-
-class SplitFCLayer : public FCLayer {
-protected:
- int _numParts;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-// void bpropBiases(NVMatrix& v, PASS_TYPE passType);
- void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
- void splitWeights();
-public:
- SplitFCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
-};
-
-class SoftmaxLayer : public Layer {
-protected:
- bool _doUpperGrad;
- NVMatrix _max, _sum;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- SoftmaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
- void setDoUpperGrad(bool b);
-};
-
-class ConcatenationLayer : public Layer {
-protected:
- intv* _copyOffsets;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- ConcatenationLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
- virtual ~ConcatenationLayer();
-};
-
-class PassThroughLayer : public Layer {
-protected:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- PassThroughLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
- virtual bool postInit();
-};
-
-class EltwiseSumLayer : public Layer {
-protected:
- floatv* _coeffs;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- EltwiseSumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
- ~EltwiseSumLayer();
-};
-
-class EltwiseMaxLayer : public Layer {
-protected:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- EltwiseMaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class SumLayer : public Layer {
-protected:
- int _stride;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- SumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class DataCopyMessage {
-public:
- enum MESSAGE_TYPE {
- COPY,
- EXIT
- };
-protected:
- CPUData* _cpuData;
- int _passIdx;
- bool _other;
- DataCopyMessage::MESSAGE_TYPE _type;
- DataCopyMessage(DataCopyMessage::MESSAGE_TYPE type) : _cpuData(NULL), _other(false), _passIdx(0), _type(type) {
- }
-public:
- DataCopyMessage(CPUData& cpuData, bool other, int passIdx) : _cpuData(&cpuData), _other(other), _passIdx(passIdx), _type(DataCopyMessage::COPY) {
- }
-
- CPUData& getData() const {
- return *_cpuData;
- }
-
- int getPassIdx() const {
- return _passIdx;
- }
-
- bool isOther() const {
- return _other;
- }
-
- DataCopyMessage::MESSAGE_TYPE getType() {
- return _type;
- }
-};
-
-class DataCopyExitMessage : public DataCopyMessage {
-public:
- DataCopyExitMessage() : DataCopyMessage(DataCopyMessage::EXIT) {
- }
-};
-
-class DataCopyThread;
-
-class DataLayer : public Layer {
-protected:
- bool _useBuffer;
- int _dataIdx;
- ConvNet* _convNet;
-// std::map<int, NVMatrix*> _outputs2; // Buffer for copying data during computation
- std::map<int, MemoryView*> _memSrcActs2; // // Buffer for copying data during computation
- std::map<int, cudaStream_t> _copyStreams;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- Queue<int> _copyFinishQueue;
- DataCopyThread* _copier;
- bool _outstandingCopyRequest;
- int _start, _end;
-
-public:
- void fprop(PASS_TYPE passType, int passIdx, bool fromBuffer);
- DataLayer(ConvNet* convNet, PyObject* paramsDict, int replicaID);
- ~DataLayer();
- NVMatrix& getActs(int deviceID);
-// NVMatrix& getActs(int deviceID, bool other);
- NVMatrix& getActs(int deviceID, bool other, int numCases);
- bool isGradProducer();
- void toggleBuffer(int passIdx);
- void copyData(CPUData& data, bool other, int passIdx);
- bool postInit();
- ConvNet& getConvNet();
- int getNumInputReplicas();
- cudaStream_t getCopyStream(int deviceID);
- Queue<int>& getCopyFinishQueue() {
- return _copyFinishQueue;
- }
- void waitForCopyFinish();
- int getDataIdx() const {
- return _dataIdx;
- }
- int getStart() const {
- return _start;
- }
- int getEnd() const {
- return _end;
- }
-};
-
-
-class DataCopyThread : public Thread {
-protected:
- DataLayer* _parent;
- Queue<DataCopyMessage*> _queue;
- HostNVMatrix _hostMemFwd;
- Timer _requestTimer;
- int _sleepUsec;
- virtual void* run();
-
-public:
- DataCopyThread(DataLayer& parent, intv& cpus);
- Queue<DataCopyMessage*>& getQueue();
- void stop();
-};
-
-
-class LocalLayer : public WeightLayer {
-protected:
- intv* _padding, *_stride, *_filterSize, *_channels, *_imgSize, *_groups;
- intv* _imgPixels, *_filterPixels, *_filterChannels;
- int _modulesX, _modules, _numFilters;
-
-public:
- LocalLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad);
- virtual ~LocalLayer();
-};
-
-class ConvLayer : public LocalLayer {
-protected:
- int _sumWidth;
- bool _sharedBiases;
- floatv* _weightContrastNormMin, *_weightContrastNormMax;
- NVMatrix _weightGradTmp;
-
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
- void bpropBiases(NVMatrix& v, PASS_TYPE passType);
- void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
- void truncBwdActs();
- void _constrainWeights();
-
-public:
- ConvLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
- virtual ~ConvLayer();
-};
-
-class LocalUnsharedLayer : public LocalLayer {
-protected:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
- void bpropBiases(NVMatrix& v, PASS_TYPE passType);
- void bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType);
- void _constrainWeights();
-public:
- LocalUnsharedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class PoolLayer : public Layer, public TwoDLayerInterface {
-protected:
- int _sizeX, _start, _stride, _outputsX;
- std::string _pool;
-public:
- PoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
-
- static PoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class AvgPoolLayer : public PoolLayer {
-protected:
- bool _sum;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- AvgPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class MaxPoolLayer : public PoolLayer {
-protected:
- bool _abs;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- MaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool abs);
-};
-
-class CrossMapPoolLayer : public Layer, public TwoDLayerInterface {
-protected:
- int _size, _start, _stride, _outputs;
- std::string _pool;
-public:
- CrossMapPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
-
- static CrossMapPoolLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class CrossMapMaxPoolLayer : public CrossMapPoolLayer {
-protected:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- CrossMapMaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class RandomScaleLayer : public Layer, public TwoDLayerInterface {
-protected:
- int _tgtSize, _minScaledSize;
- float _maxScale; // should be >= 1
- NVMatrix _rescaledActs;
- std::vector<double> _scaleProbs;
-public:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-
- RandomScaleLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class CropLayer : public Layer, public TwoDLayerInterface {
-protected:
- int _tgtSize, _startX, _startY;
-public:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-
- CropLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class NailbedLayer : public Layer, public TwoDLayerInterface {
-protected:
- int _start, _stride, _outputsX;
-public:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-
- NailbedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class GaussianBlurLayer : public Layer, public TwoDLayerInterface {
-protected:
- Matrix* _hFilter;
- NVMatrix _filter;
- NVMatrix _actGradsTmp;
-public:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
- void copyToGPU();
-
- GaussianBlurLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
- ~GaussianBlurLayer();
-};
-
-class HorizontalReflectionLayer : public Layer, public TwoDLayerInterface {
-protected:
-public:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-
- HorizontalReflectionLayer(ConvNetThread* convNet, PyObject* paramsDict, int replicaID);
-};
-
-class ResizeLayer : public Layer, public TwoDLayerInterface {
-protected:
- float _scale;
- int _tgtSize;
-public:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-
- ResizeLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class DropoutLayer : public Layer {
-protected:
- bool _enable;
- float _keep;
- NVMatrix _keepMask;
-public:
- virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- virtual void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
- void truncBwdActs();
- DropoutLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
- class DropoutSmallerThanOperator {
- private:
- float _keep, _scale;
- public:
- DropoutSmallerThanOperator(float keep) : _keep(keep), _scale(1.0f/keep) {
- }
- __device__ inline float operator()(const float x) const {
- return (x < _keep) * _scale;
- }
- };
-};
-
-class Dropout2Layer : public DropoutLayer {
-protected:
-public:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
- Dropout2Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class RGBToYUVLayer : public Layer {
-public:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-
- RGBToYUVLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class RGBToLABLayer : public Layer {
-protected:
- bool _center;
-public:
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-
- RGBToLABLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class ResponseNormLayer : public Layer, public TwoDLayerInterface {
-protected:
- int _size;
- float _scale, _pow;
- float _minDiv;
- NVMatrix _denoms;
-
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
- void truncBwdActs();
-public:
- ResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class CrossMapResponseNormLayer : public ResponseNormLayer {
-protected:
- bool _blocked;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- CrossMapResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class ContrastNormLayer : public ResponseNormLayer {
-protected:
- NVMatrix _meanDiffs;
-
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
- void truncBwdActs();
-public:
- ContrastNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class CostLayer : public Layer {
-protected:
- float _coeff;
- doublev _costv;
- NVMatrix _tmpbuf; // For error accumulation
- int _numCases; // number of cases that the values in _costv were computed on
- bool _aggregated;
- void fpropCommon(PASS_TYPE passType);
-public:
- CostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans);
- void bprop(NVMatrix& v, PASS_TYPE passType, int passIdx);
- bool fprop(PASS_TYPE passType, int passIdx);
-
- int getNumCases();
- virtual doublev& getCost();
- float getCoeff();
- bool isGradProducer();
- void setSendTerminalMessages(bool send);
- void resetPassIdx();
-
- static CostLayer& make(ConvNetThread* convNetThread, PyObject* paramsDict, std::string& type, int replicaID);
-};
-
-/*
- * Input 0: labels
- * Input 1: softmax outputs
- */
-class CrossEntCostLayer : public CostLayer {
-protected:
- NVMatrix _trueLabelLogProbs, _correctProbs;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- CrossEntCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-/*
- * Input 0: labels
- * Input 1: softmax outputs
- */
-class LogregCostLayer : public CostLayer {
-protected:
- NVMatrix _trueLabelLogProbs, _correctProbs, _topkProbs;
- std::map<int,NVMatrix*> _probsAccum; // input replica idx -> nvmatrix
- NVMatrix _maxProbs;
- std::map<int,int> _numAccumed; // input replica idx -> int
- int _topk;
- bool _doCompute;
- virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- LogregCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
- NVMatrix& getProbsAccum(int replicaIdx);
-};
-
-/*
- * Input 0: labels
- * Input 1: logistic outputs
- */
-class BinomialCrossEntropyCostLayer : public CostLayer {
-protected:
- bool _computeSoftmaxErrorRate;
- NVMatrix _tmpProbs, _tmpVec, _correctProbs;
- float _posWeight;
- virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- BinomialCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
- float getPosWeight();
-
- // Only for use with non-logistic units
- class BinomialCrossEntGradientOperator {
- private:
- float _coeff, _posWeight;
- public:
- BinomialCrossEntGradientOperator(float coeff, float posWeight) : _coeff(coeff), _posWeight(posWeight) {
- }
- __device__ inline float operator()(const float t, const float y) const {
- return _coeff * (_posWeight * __fdividef(t, y) + __fdividef(t - 1.0f, 1.0f - y));
- }
- };
-};
-
-/*
- * Input 0: labels
- * Input 1: logistic outputs
- */
-class DetectionCrossEntropyCostLayer : public BinomialCrossEntropyCostLayer {
-protected:
- Matrix _hNumPositive, _hNumTruePositive, _hNumDeclaredPositive;
- NVMatrix _numPositive, _numTrueNegative, _numTruePositive, _numDeclaredPositive;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
-public:
- DetectionCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-class SumOfSquaresCostLayer : public CostLayer {
-protected:
- NVMatrix _tmp;
- void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx);
- void bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType);
-public:
- SumOfSquaresCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID);
-};
-
-#endif /* LAYER_CUH */
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LAYER_KERNELS_CUH
-#define LAYER_KERNELS_CUH
-
-#include <vector>
-#include <helper_cuda.h>
-#include "../../nvmatrix/include/nvmatrix.cuh"
-
-#define LOGREG_GRAD_THREADS_X 32
-#define LOGREG_GRAD_THREADS_Y 4
-
-#define LOGREG_ERR_THREADS_X 128
-#define LOGREG_ERR_THREADS_Y 1
-
-__device__ inline float safelog(const float x) {
- return x > 0.0f ? __logf(x) : -50.0f;
-}
-
-// The input matrix here is the squared norm.
-// This replaces the squared norm with:
-// 1 if it is below the threshold given by norm2
-// norm/sqrt(a) otherwise -- i.e. the desired norm (not squared)
-class MaxWeightConstraintOperator {
-private:
- float _norm, _norm2;
-public:
- MaxWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) {
- }
- __device__ inline float operator()(const float a) const {
- return a > _norm2 ? __fdividef(_norm, sqrtf(a)) : 1.0f;
- }
-};
-
-class HardWeightConstraintOperator {
-private:
- float _norm, _norm2;
-public:
- HardWeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) {
- }
- __device__ inline float operator()(const float a) const {
- return __fdividef(_norm, sqrtf(a));
- }
-};
-
-class WeightContrastNormOperator {
-private:
- float _min, _max, _scale;
-public:
- WeightContrastNormOperator(float min, float max, float scale) : _min(min), _max(max), _scale(scale) {
- }
- __device__ inline float operator()(float a) const {
- a = sqrtf(a) * _scale;
- return a < _min ? __fdividef(_min, a) : a > _max ? __fdividef(_max, a) : 1.0f;
- }
-};
-
-void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
-void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
-void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad);
-
-void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
-void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
-
-
-// Numerical stability optimization: this routine combines computeLogregGrad with computeSoftmaxGrad
-// to avoi dividing and then multiplying by quantities that may be near zero.
-void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
-void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
-void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add);
-void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out,
- NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize);
-#endif /* LAYER_KERNELS_CUH */
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LR_CUH
-#define LR_CUH
-
-#include <string>
-#include <vector>
-#include <iostream>
-#include <helper_cuda.h>
-#include <assert.h>
-#include <Python.h>
-#include "util.cuh"
-#include "../../nvmatrix/include/nvmatrix.cuh"
-#include "../../util/include/matrix.h"
-
-/*
- * The maximum learning rate is _baseRate.
- * The minimum learning rate is _baseRate / _tgtFactor.
- *
- * These classes define annealing schedules that interpolate between these
- * two extrema.
- */
-class ParameterSchedule {
-protected:
- double _baseRate;
-public:
- ParameterSchedule(double base);
- virtual double getValue(double progress);
- double getBaseValue() const;
- virtual ~ParameterSchedule();
-
- static ParameterSchedule& make(PyObject* schedDict);
-};
-
-class LinearParameterSchedule : public ParameterSchedule {
-protected:
- double _finalRate;
-public:
- LinearParameterSchedule(double base, double tgtFactor);
- virtual double getValue(double progress);
-};
-
-class ExpParameterSchedule : public ParameterSchedule {
-protected:
- double _powBase;
-public:
- ExpParameterSchedule(double baseRate, double tgtFactor);
- virtual double getValue(double progress);
-};
-
-class DiscreteExpParameterSchedule : public ParameterSchedule {
-protected:
- std::vector<double> _rates;
-public:
- DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps);
- virtual double getValue(double progress);
-};
-
-
-#endif /* LR_CUH */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <map>
-#include <set>
-#include "../../nvmatrix/include/nvmatrix.cuh"
-
-class MemorySource;
-
-class MemoryView {
-protected:
- MemorySource* _src;
- std::string _name;
-public:
- MemoryView(MemorySource& src, std::string& name);
- ~MemoryView();
- NVMatrix& getMemory(int numCases);
- NVMatrix& getMemory();
- MemorySource& getMemorySource();
- bool isParent();
- std::string& getName();
- MemoryView& clone(std::string& name);
-};
-
-// Remember: PassThroughLayer, and therefore MemorySource, exists on a particular GPU.
-class MemorySource {
-protected:
-// int _inputIdx;
- NVMatrix _memory;
- int _deviceID;
- int _size;
- std::map<std::string, std::pair<int,int> > _viewRanges;
- std::map<std::string, NVMatrix*> _memoryViews; // input idx --> slice of _memory
- std::set<std::string> _truncateRequests;
- Lock _lock;
-public:
- MemorySource(int size, int deviceID);
- ~MemorySource();
- NVMatrix& getMemory(std::string& name, int numCases);
- NVMatrix& getMemory(std::string& name);
- MemoryView& addUser(std::string& name, std::pair<int,int> range);
- MemoryView& addUser(std::string& name);
- std::pair<int,int> getRange(std::string& name);
- int getSize();
- bool truncate(std::string& name);
- static MemoryView& make(int size, int deviceID, std::string& parentUser);
-};
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MESSAGES_CUH_
-#define MESSAGES_CUH_
-
-#include <string>
-#include "layer.cuh"
-
-class Layer;
-
-enum MESSAGES { FPROP_TERMINAL,
- BPROP_TERMINAL,
- BPROP_READY,
- FPROP_READY,
- SYNC,
- COPY_TO_CPU,
- COPY_TO_GPU,
- UPDATE_WEIGHTS,
- CONSTRAIN_WEIGHTS,
- RESET,
- RESET_PASS_IDX,
- COST_COMPUTED,
- BPROP_START,
- EXIT_CONVNET};
-
-class Message {
-protected:
- MESSAGES _messageType;
-public:
- MESSAGES getType() {
- return _messageType;
- }
- virtual Message* clone() {
- return new Message(_messageType);
- }
- Message(MESSAGES messageType) : _messageType(messageType) {
- }
- virtual ~Message() {
- }
-};
-
-class PropMessage : public Message {
-protected:
- Layer *_toLayer;
- PASS_TYPE _passType;
- int _passIdx;
-public:
-
- Layer& getToLayer() {
- return *_toLayer;
- }
-
- PASS_TYPE getPassType() {
- return _passType;
- }
-
- int getPassIdx() {
- return _passIdx;
- }
-
- virtual PropMessage* clone() {
- return new PropMessage(*_toLayer, _passType, _passIdx, _messageType);
- }
-
- PropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx, MESSAGES msgType)
- : _toLayer(&toLayer), _passType(passType), _passIdx(passIdx), Message(msgType) {
- }
-};
-
-class FpropMessage : public PropMessage {
-public:
- FpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx)
- : PropMessage(toLayer, passType, passIdx, FPROP_READY) {
- }
- virtual FpropMessage* clone() {
- return new FpropMessage(*_toLayer, _passType, _passIdx);
- }
-};
-
-class BpropMessage : public PropMessage {
-public:
- BpropMessage(Layer& toLayer, PASS_TYPE passType, int passIdx)
- : PropMessage(toLayer, passType, passIdx, BPROP_READY) {
- }
- virtual BpropMessage* clone() {
- return new BpropMessage(*_toLayer, _passType, _passIdx);
- }
-};
-
-class BpropStartMessage : public Message {
-protected:
- PASS_TYPE _passType;
- int _passIdx;
-public:
- PASS_TYPE getPassType() {
- return _passType;
- }
-
- int getPassIdx() {
- return _passIdx;
- }
-
- virtual BpropStartMessage* clone() {
- return new BpropStartMessage(_passType, _passIdx);
- }
-
- BpropStartMessage(PASS_TYPE passType, int passIdx)
- : _passType(passType), Message(BPROP_START), _passIdx(passIdx) {
- }
-};
-
-
-
-#endif /* MESSAGES_CUH_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef NEURONS_CUH
-#define NEURONS_CUH
-
-#include <Python.h>
-#include <assert.h>
-#include <string>
-#include "../../nvmatrix/include/nvmatrix.cuh"
-#include <helper_cuda.h>
-
-template <class GradientOp>
-class AddGradientBinaryOperator {
- GradientOp _op;
-public:
- AddGradientBinaryOperator(GradientOp op) : _op(op) {
- }
- __device__ inline float operator()(const float unitActGrad, const float unitAct, const float target) const {
- return _op(unitActGrad, unitAct) + target;
- }
-};
-
-template <class GradientOp>
-class AddGradientOperator {
- GradientOp _op;
-public:
- AddGradientOperator(GradientOp op) : _op(op) {
- }
- __device__ inline float operator()(const float unitActGrad, const float target) const {
- return target + _op(unitActGrad);
- }
-};
-
-/* =======================
- * Neuron
- * -----------------------
- *
- * f(x) = x
- * =======================
- */
-class Neuron {
-protected:
- bool _activated;
- // Inputs and outputs potentially point to the same matrix, depending on the neuron
- NVMatrix* _inputs, *_outputs;
- virtual void _activate() {
- if (_inputs != _outputs) {
- _inputs->copy(*_outputs);
- }
- }
- virtual void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- if (&target != &actsGrad) {
- actsGrad.copy(target);
- }
- }
- virtual void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- if (&target != &actsGrad) {
- target.add(actsGrad);
- }
- }
-public:
- Neuron() : _activated(false), _inputs(NULL), _outputs(NULL) {
- }
- virtual void activate(NVMatrix& inputs, NVMatrix& outputs) {
- _activated = true;
- _inputs = &inputs;
- _outputs = &outputs;
- _activate();
- }
-
- virtual void computeInputGrad(NVMatrix& actsGrad, NVMatrix& target, bool add) {
- assert(_activated);
- if (!add) {
- target.resize(actsGrad);
- _computeInputGrad(actsGrad, target);
- } else {
- _addInputGrad(actsGrad, target);
- }
- }
-
- static Neuron& makeNeuron(PyObject* neuronDict);
-};
-
-/* =======================
- * LogisticNeuron
- * -----------------------
- *
- * f(x) = 1 / (1 + e^-x)
- * =======================
- */
-class LogisticNeuron : public Neuron {
-protected:
- void _activate() {
- _inputs->apply(NVMatrixOps::Logistic(), *_outputs);
- }
-
- void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyBinary(LogisticGradientOperator(), *_outputs, target);
- }
-
- void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyTernary(AddGradientBinaryOperator<LogisticGradientOperator>(LogisticGradientOperator()), *_outputs, target, target);
- }
-public:
- class LogisticGradientOperator {
- public:
- __device__ inline float operator()(float unitActGrad, float unitAct) const {
- return unitActGrad * unitAct * (1.0f - unitAct);
- }
- };
-
- LogisticNeuron() : Neuron() {
- }
-};
-
-/* =======================
- * LogNeuron
- * -----------------------
- *
- * f(x) = log(eps + x)
- * =======================
- */
-class LogNeuron : public Neuron {
-protected:
- float _eps;
- void _activate() {
- _inputs->apply(LogOperator(_eps), *_outputs);
- }
-
- void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyBinary(LogGradientOperator(_eps), *_inputs, target);
- }
-
- void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyTernary(AddGradientBinaryOperator<LogGradientOperator>(LogGradientOperator(_eps)), *_inputs, target, target);
- }
-public:
- class LogGradientOperator {
- protected:
- float _eps;
- public:
- __device__ inline float operator()(float unitActGrad, float unitInput) const {
- return __fdividef(unitActGrad, _eps + unitInput);
- }
- LogGradientOperator(float eps) : _eps(eps) {
-
- }
- };
-
- class LogOperator {
- protected:
- float _eps;
- public:
- __device__ inline float operator()(float x) const {
- return __logf(_eps + x);
- }
- LogOperator(float eps) : _eps(eps) {
-
- }
- };
-
- LogNeuron(float eps) : _eps(eps), Neuron() {
- }
-};
-
-/* =======================
- * ReluNeuron
- * -----------------------
- *
- * f(x) = max(0, x)
- * =======================
- */
-class ReluNeuron : public Neuron {
-protected:
- virtual void _activate() {
- _inputs->apply(ReluOperator(), *_outputs);
- }
-
- void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyBinary(ReluGradientOperator(), *_outputs, target);
- }
-
- void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyTernary(AddGradientBinaryOperator<ReluGradientOperator>(ReluGradientOperator()), *_outputs, target, target);
- }
-public:
- class ReluOperator {
- public:
- __device__ inline float operator()(float x) const {
- return x < 0.0f ? 0.0f : x;
- }
- };
-
- class ReluGradientOperator {
- public:
- __device__ inline float operator()(float unitActGrad, float unitAct) const {
- return unitActGrad * (unitAct > 0.0f);
- }
- };
-
- ReluNeuron() : Neuron() {
- }
-};
-
-
-/* =======================
- * BoundedReluNeuron
- * -----------------------
- *
- * f(x) = min(a, max(0, x))
- * =======================
- */
-class BoundedReluNeuron : public Neuron {
-protected:
- float _a;
-
- void _activate() {
- _inputs->apply(BoundedReluOperator(_a), *_outputs);
- }
-
- void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyBinary(BoundedReluGradientOperator(_a), *_outputs, target);
- }
-
- void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyTernary(AddGradientBinaryOperator<BoundedReluGradientOperator>(BoundedReluGradientOperator(_a)), *_outputs, target, target);
- }
-public:
- class BoundedReluOperator {
- private:
- float _a;
- public:
- BoundedReluOperator(float a) : _a(a) {
- }
- __device__ inline float operator()(float x) const {
- return x < 0.0f ? 0.0f : x > _a ? _a : x;
- }
- };
-
- class BoundedReluGradientOperator {
- private:
- float _a;
- public:
- BoundedReluGradientOperator(float a) : _a(a) {
- }
- __device__ inline float operator()(float unitActGrad, float unitAct) const {
- return unitActGrad * (unitAct > 0.0f) * (unitAct < _a);
- }
- };
-
- BoundedReluNeuron(float a) : Neuron(), _a(a) {
- }
-};
-
-/* =======================
- * AbsNeuron
- * -----------------------
- *
- * f(x) = abs(x)
- * =======================
- */
-class AbsNeuron : public Neuron {
-protected:
- void _activate() {
- assert(_inputs != _outputs);
- _inputs->apply(NVMatrixOps::Abs(), *_outputs);
- }
-
- void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyBinary(AbsGradientOperator(), *_inputs, target);
- }
-
- void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyTernary(AddGradientBinaryOperator<AbsGradientOperator>(AbsGradientOperator()), *_inputs, target, target);
- }
-public:
- class AbsGradientOperator {
- public:
- __device__ inline float operator()(float unitActGrad, float unitInput) const {
- return unitActGrad * (unitInput > 0.0f ? 1.0f : -1.0f);
- }
- };
-
- AbsNeuron() : Neuron() {
- }
-};
-
-/* =======================
- * TanhNeuron
- * -----------------------
- *
- * f(x) = a*tanh(b*x)
- * =======================
- */
-class TanhNeuron : public Neuron {
-protected:
- float _a, _b;
-
- void _activate() {
- _inputs->apply(TanhOperator(_a, _b), *_outputs);
- }
-
- void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyBinary(TanhGradientOperator(_a, _b), *_outputs, target);
- }
-
- void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyTernary(AddGradientBinaryOperator<TanhGradientOperator>(TanhGradientOperator(_a, _b)), *_outputs, target, target);
- }
-public:
- class TanhOperator {
- private:
- float _a, _n2b;
- public:
- TanhOperator(float a, float b) : _a(a), _n2b(-2*b) {
- }
- virtual __device__ inline float operator()(float x) const {
- return _a * (__fdividef(2.0f, 1.0f + __expf(x * _n2b)) - 1.0f);
- }
- };
-
- class TanhGradientOperator {
- private:
- float _b, _a;
- public:
- TanhGradientOperator(float a, float b) : _b(b), _a(a) {
- }
- __device__ inline float operator()(float unitActGrad, float unitAct) const {
-// const float t = (1.0f - __fdividef(unitAct, _a)) / 2.0f;
-// return unitActGrad * _n4ab * (t * (t - 1.0f));
- return unitActGrad * _b * (_a - __fdividef(unitAct * unitAct, _a));
- }
- };
-
- TanhNeuron(float a, float b) : Neuron(), _a(a), _b(b) {
- }
-};
-
-/* =======================
- * DoubleReluNeuron
- * -----------------------
- *
- * f(x) = x - a*tanh(x/a)
- * =======================
- */
-class DoubleReluNeuron : public Neuron {
-protected:
- float _a;
-
- void _activate() {
- assert(_inputs != _outputs);
- _inputs->apply(DoubleReluOperator(_a), *_outputs);
- }
-
- void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyBinary(DoubleReluGradientOperator(_a), *_inputs, target);
- }
-
- void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyTernary(AddGradientBinaryOperator<DoubleReluGradientOperator>(DoubleReluGradientOperator(_a)), *_inputs, target, target);
- }
-public:
- class DoubleReluOperator {
- private:
- float _a, _n2a;
- public:
- DoubleReluOperator(float a) : _a(a), _n2a(-2.0f / a) {
- }
- virtual __device__ inline float operator()(float x) const {
- return x - _a * (__fdividef(2.0f, 1.0f + __expf(_n2a * x)) - 1.0f);
- }
- };
-
- class DoubleReluGradientOperator {
- private:
- float _n2a;
- public:
- DoubleReluGradientOperator(float a) : _n2a(-2.0f / a) {
- }
- __device__ inline float operator()(float unitActGrad, float unitInput) const {
- const float tanh = __fdividef(2.0f, 1.0f + __expf(_n2a * unitInput)) - 1.0f;
- return unitActGrad * (tanh*tanh);
- }
- };
-
- DoubleReluNeuron(float a) : Neuron(), _a(a) {
- }
-};
-
-/* =======================
- * SoftReluNeuron
- * -----------------------
- *
- * f(x) = log(1 + e^x)
- * =======================
- */
-class SoftReluNeuron : public Neuron {
-protected:
- void _activate() {
-// assert(_inputs != _outputs);
- _inputs->apply(SoftReluOperator(), *_outputs);
- }
-
- void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyBinary(SoftReluGradientOperator(), *_outputs, target);
- }
-
- void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyTernary(AddGradientBinaryOperator<SoftReluGradientOperator>(SoftReluGradientOperator()), *_outputs, target, target);
- }
-public:
- class SoftReluOperator {
- public:
- __device__ inline float operator()(float x) const {
- // This piece-wise implementation has better numerical stability than
- // simply computing log(1 + e^x).
- return x > 4.0f ? x : __logf(1.0f + __expf(x));
- }
- };
-
- class SoftReluGradientOperator {
- public:
- __device__ inline float operator()(float unitActGrad, float unitOutput) const {
- if (unitOutput > 4.0f) {
- return unitActGrad;
- }
- const float f = __expf(-unitOutput);
- return unitActGrad * (1.0f - f);
- }
- };
-
- SoftReluNeuron() : Neuron() {
- }
-};
-
-/* =======================
- * SquareNeuron
- * -----------------------
- *
- * f(x) = x^2
- * =======================
- */
-class SquareNeuron : public Neuron {
-protected:
- void _activate() {
- assert(_inputs != _outputs);
- _inputs->apply(NVMatrixOps::Square(), *_outputs);
- }
-
- void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyBinary(SquareGradientOperator(), *_inputs, target);
- }
-
- void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyTernary(AddGradientBinaryOperator<SquareGradientOperator>(SquareGradientOperator()), *_inputs, target, target);
- }
-public:
- class SquareGradientOperator {
- public:
- __device__ inline float operator()(float unitActGrad, float unitInput) const {
- return unitActGrad * 2.0f * unitInput;
- }
- };
-
- SquareNeuron() : Neuron() {
- }
-};
-
-/* =======================
- * SqrtNeuron
- * -----------------------
- *
- * f(x) = sqrt(x)
- * =======================
- */
-class SqrtNeuron : public Neuron {
-protected:
- void _activate() {
- _inputs->apply(NVMatrixOps::Sqrt(), *_outputs);
- }
-
- void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyBinary(SqrtGradientOperator(), *_outputs, target);
- }
-
- void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyTernary(AddGradientBinaryOperator<SqrtGradientOperator>(SqrtGradientOperator()), *_outputs, target, target);
- }
-public:
- class SqrtGradientOperator {
- public:
- __device__ inline float operator()(float unitActGrad, float unitAct) const {
- return __fdividef(unitActGrad, 2.0f * unitAct);
- }
- };
-
- SqrtNeuron() : Neuron() {
- }
-};
-
-/* =======================
- * LinearNeuron
- * -----------------------
- *
- * f(x) = a*x + b
- * =======================
- */
-class LinearNeuron : public Neuron {
-protected:
- float _a, _b;
- void _activate() {
- _inputs->apply(NVMatrixOps::Linear(_a, _b), *_outputs);
- }
-
- void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.scale(_a, target);
- }
-
- void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
- actsGrad.applyBinary(AddGradientOperator<NVMatrixOps::MultByScalar>(NVMatrixOps::MultByScalar(_a)), target, target);
- }
-public:
- LinearNeuron(float a, float b) : Neuron(), _a(a), _b(b) {
- }
-};
-#endif /* NEURONS_CUH */
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PIPEDISPENSER_CUH_
-#define PIPEDISPENSER_CUH_
-
-#include <pthread.h>
-#include <set>
-#include <algorithm>
-#include <iterator>
-#include "../../util/include/thread.h"
-#include "util.cuh"
-
-/*
- * PipeDispenser interface
- */
-class PipeDispenser {
-protected:
- int _numPipes;
- seti _pipes;
- pthread_mutex_t *_mutex;
-
- void lock() {
- pthread_mutex_lock(_mutex);
- }
-
- void unlock() {
- pthread_mutex_unlock(_mutex);
- }
-
- virtual void init() {
- _mutex = (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t)));
- pthread_mutex_init(_mutex, NULL);
- }
-public:
- PipeDispenser(const seti& pipes) {
- _pipes.insert(pipes.begin(), pipes.end());
- init();
- }
-
- PipeDispenser(int numPipes) {
- for (int i = 0; i < numPipes; ++i) {
- _pipes.insert(i);
- }
- init();
- }
-
- virtual ~PipeDispenser() {
- pthread_mutex_destroy(_mutex);
- free(_mutex);
- }
-
- virtual int getPipe(const seti& interested) = 0;
-
- int getPipe(int interested) {
- seti tmp;
- tmp.insert(interested);
- return getPipe(tmp);
- }
-
- virtual void freePipe(int pipe) = 0;
-};
-
-/*
- * This one blocks until there is a free pipe to return.
- */
-class PipeDispenserBlocking : public PipeDispenser {
-protected:
- pthread_cond_t *_cv;
-
- void wait() {
- pthread_cond_wait(_cv, _mutex);
- }
-
- void broadcast() {
- pthread_cond_broadcast(_cv);
- }
-
- int getAvailablePipes(const seti& interested, intv& available) {
- available.clear();
- std::set_intersection(_pipes.begin(), _pipes.end(), interested.begin(), interested.end(), std::back_inserter(available));
- return available.size();
- }
-
- virtual void init() {
- PipeDispenser::init();
- _cv = (pthread_cond_t*)(malloc(sizeof (pthread_cond_t)));
- pthread_cond_init(_cv, NULL);
- }
-public:
- PipeDispenserBlocking(const seti& pipes) : PipeDispenser(pipes) {
- init();
- }
-
- PipeDispenserBlocking(int numPipes) : PipeDispenser(numPipes) {
- init();
- }
-
- ~PipeDispenserBlocking() {
- pthread_cond_destroy(_cv);
- free(_cv);
- }
-
- int getPipe(const seti& interested) {
- lock();
- intv avail;
- while (getAvailablePipes(interested, avail) == 0) {
- wait();
- }
- int pipe = avail[0];
- _pipes.erase(pipe);
- unlock();
- return pipe;
- }
-
- void freePipe(int pipe) {
- lock();
- _pipes.insert(pipe);
- broadcast();
- unlock();
- }
-};
-
-/*
- * This one returns the least-occupied pipe.
- */
-class PipeDispenserNonBlocking : public PipeDispenser {
-protected:
- std::map<int,int> _pipeUsers;
-
-public:
- PipeDispenserNonBlocking(const seti& pipes) : PipeDispenser(pipes) {
- for (seti::iterator it = pipes.begin(); it != pipes.end(); ++it) {
- _pipeUsers[*it] = 0;
- }
- }
-
- int getPipe(const seti& interested) {
- lock();
- int pipe = -1, users = 1 << 30;
- for (seti::iterator it = _pipes.begin(); it != _pipes.end(); ++it) {
- if (interested.count(*it) > 0 && _pipeUsers[*it] < users) {
- pipe = *it;
- users = _pipeUsers[*it];
- }
- }
- if (pipe >= 0) {
- _pipeUsers[pipe]++;
- }
- unlock();
- return pipe;
- }
-
- void freePipe(int pipe) {
- lock();
- _pipeUsers[pipe]--;
- unlock();
- }
-};
-
-
-#endif /* PIPEDISPENSER_CUH_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef PYCONVNET3_CUH
-#define PYCONVNET3_CUH
-
-#define _QUOTEME(x) #x
-#define QUOTEME(x) _QUOTEME(x)
-
-extern "C" void init_ConvNet();
-
-PyObject* initModel(PyObject *self, PyObject *args);
-PyObject* startBatch(PyObject *self, PyObject *args);
-PyObject* finishBatch(PyObject *self, PyObject *args);
-PyObject* checkGradients(PyObject *self, PyObject *args);
-PyObject* syncWithHost(PyObject *self, PyObject *args);
-PyObject* startMultiviewTest(PyObject *self, PyObject *args);
-PyObject* startFeatureWriter(PyObject *self, PyObject *args);
-PyObject* startDataGrad(PyObject *self, PyObject *args);
-PyObject* decodeJpeg(PyObject *self, PyObject *args);
-
-#endif
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef REDUCEPIPELINE_CUH_H_
-#define REDUCEPIPELINE_CUH_H_
-
-#include "../../util/include/thread.h"
-#include "../../util/include/queue.h"
-#include <helper_cuda.h>
-#include "../../nvmatrix/include/nvmatrix.cuh"
-#include "util.cuh"
-
-#define REDUCE_MIN_CHUNK_SIZE (1<<18) // 256k
-#define REDUCE_MAX_CHUNKS 16
-#define REDUCE_MIN_CHUNKS 2
-
-enum REDUCE_MESSAGE_TYPE {
- REDUCE_CHUNK,
- REDUCE_START,
- EXIT
-};
-
-class ReducePeer;
-class ReducerSource;
-class IReduceSegment;
-class IEightGPUReducer;
-
-class ReduceMessage {
-protected:
- REDUCE_MESSAGE_TYPE _msgType;
- float _scaleIntermediates, _scaleTarget;
- std::map<int,NVMatrix*>* _mats;
-public:
- ReduceMessage(REDUCE_MESSAGE_TYPE msgType, float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
- : _msgType(msgType), _scaleIntermediates(scaleIntermediates), _scaleTarget(scaleTarget), _mats(&mats) {
- }
- ReduceMessage(REDUCE_MESSAGE_TYPE msgType)
- : _msgType(msgType), _scaleIntermediates(0), _scaleTarget(0), _mats(NULL) {
- }
- inline REDUCE_MESSAGE_TYPE getType() const {
- return _msgType;
- }
- inline float getScaleIntermediates() const {
- return _scaleIntermediates;
- }
- inline float getScaleTarget() const {
- return _scaleTarget;
- }
- inline NVMatrix& getMatrix(int deviceID) const {
- return *_mats->at(deviceID);
- }
- inline std::map<int,NVMatrix*>& getMatrices() const {
- return *_mats;
- }
-};
-
-class ReduceChunkMessage : public ReduceMessage {
-protected:
- int _chunkIdx;
- int _chunkSize;
- int _numChunks;
-
- IReduceSegment* _src;
-public:
- ReduceChunkMessage(IReduceSegment& src, int chunkIdx, int chunkSize, int numChunks, float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
- : _src(&src), _chunkIdx(chunkIdx), _chunkSize(chunkSize), _numChunks(numChunks),
- ReduceMessage(REDUCE_CHUNK, scaleIntermediates, scaleTarget, mats) {
- }
-
- inline int getChunkIdx() const {
- return _chunkIdx;
- }
-
- inline int getChunkSize() const {
- return _chunkSize;
- }
-
- inline int getNumChunks() const {
- return _numChunks;
- }
-
- inline IReduceSegment& getSource() const {
- return *_src;
- }
-};
-
-class ReduceStartMessage : public ReduceMessage {
-public:
- ReduceStartMessage(float scaleIntermediates, float scaleTarget, std::map<int,NVMatrix*>& mats)
- : ReduceMessage(REDUCE_START, scaleIntermediates, scaleTarget, mats) {
- }
-};
-
-class IReduceSegment : public Thread {
-protected:
- int _deviceID;
- std::vector<IReduceSegment*> _prev;
- ReducePeer* _next;
- Queue<ReduceMessage*> _queue;
- Queue<int>* _finishQueue;
-
- NVMatrix& getChunk(const NVMatrix& mat, int chunkSize, int chunkIdx);
- void* run();
- virtual bool processMessage(ReduceMessage& msg) = 0;
-
-public:
- IReduceSegment(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue);
- virtual ~IReduceSegment();
- inline virtual NVMatrix& getMatrix(ReduceMessage& msg);
- Queue<ReduceMessage*>& getQueue();
- int getDeviceID() const;
- void addPrev(IReduceSegment& c);
- void addNext(ReducePeer& c);
- bool isTerminal() const;
-};
-
-class ReducerSource : public IReduceSegment {
-protected:
- bool processMessage(ReduceMessage& msg);
-public:
- ReducerSource(IEightGPUReducer& parent, int deviceID);
-};
-
-class ReducePeer : public IReduceSegment {
-protected:
- std::map<int,cudaStream_t> _streams; // device id -> stream
- std::map<int,int> _numInputsReceived; // chunk idx -> num inputs
- int _numInputsFinished;
- HostNVMatrix _mat;
- bool _add;
- bool processMessage(ReduceMessage& msg);
- inline cudaStream_t getStream(int deviceID);
- inline NVMatrix& getMatrix(ReduceMessage& msg);
- void hostAdd(const float* src, float* tgt, const int n, const float scaleTgt);
-public:
- ReducePeer(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue);
- ReducePeer(IEightGPUReducer& parent);
- ~ReducePeer();
-};
-
-class IEightGPUReducer {
-protected:
- std::vector<ReducerSource*> _sources;
- std::vector<ReducePeer*> _peers;
- Queue<int> _finishQueue;
- int _tgtDeviceID;
- virtual void makeConnections(std::vector<int>& same, std::vector<int>&other) = 0;
-public:
- IEightGPUReducer(int tgtDeviceID);
- virtual ~IEightGPUReducer();
- IEightGPUReducer& construct();
- void reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates, float scaleTarget);
- void reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates);
- void reduce(std::map<int, NVMatrix*>& mats);
- int getTgtDeviceID() const;
-};
-
-class EightGPUReducer1 : public IEightGPUReducer {
-protected:
- void makeConnections(std::vector<int>& same, std::vector<int>&other);
-public:
- EightGPUReducer1(int tgtDeviceID);
-};
-
-class EightGPUReducer2 : public IEightGPUReducer {
-protected:
- void makeConnections(std::vector<int>& same, std::vector<int>&other);
-public:
- EightGPUReducer2(int tgtDeviceID);
-};
-
-#endif /* REDUCEPIPELINE_CUH_H_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef STREAMBROADCAST_CUH_
-#define STREAMBROADCAST_CUH_
-
-#include <iostream>
-#include "../../util/include/queue.h"
-#include "../../nvmatrix/include/nvmatrix.cuh"
-#include "util.cuh"
-
-class Layer;
-
-//#define NUM_STREAM_COPY_PARTS 4
-// This is in 4-byte words, not bytes
-#define SB_MIN_CHUNK_SIZE (1<<17)
-#define SB_MAX_CHUNKS 16
-
-class StreamBroadcast {
-protected:
- std::map<int,cudaStream_t> _streams;
- std::set<int> _ownedStreams;
- HostNVMatrix _hostMem;
- void toHostMem(NVMatrix& src, NVMatrix& hostmem, int srcDevice);
- void toTarget(NVMatrix& hostmem, NVMatrix& tgt, int tgtDevice, float scaleTarget, float scaleOutput);
- void init(std::map<int,cudaStream_t>& streams);
- void init(std::map<int,NVMatrix*>& mats);
-public:
- StreamBroadcast(std::map<int,cudaStream_t>& streams);
- StreamBroadcast();
- virtual ~StreamBroadcast();
-
- void transfer(std::map<int,NVMatrix*>& mats, HostNVMatrix& hostmem, int srcDevice, float scaleTarget, float scaleOutput);
- void transfer(std::map<int,NVMatrix*>& mats, int srcDevice, float scaleTarget, float scaleOutput);
- void transfer(std::map<int,NVMatrix*>& mats, int srcDevice);
- void sync(int deviceID);
- cudaStream_t getStream(int deviceID);
-};
-
-#endif /* STREAMBROADCAST_CUH_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef TIMER_CC_H_
-#define TIMER_CC_H_
-
-#include <helper_timer.h>
-
-class Timer {
-protected:
- StopWatchInterface* _timer;
- bool _started;
-
-public:
- Timer() : _started(false) {
- sdkCreateTimer(&_timer);
- }
-
- ~Timer() {
- sdkDeleteTimer(&_timer);
- }
- inline void start () {
- _started = true;
- sdkResetTimer(&_timer);
- sdkStartTimer(&_timer);
- }
-
- inline double stop() {
- sdkStopTimer(&_timer);
- _started = false;
- return sdkGetTimerValue(&_timer);
- }
-
- inline bool isStarted() const {
- return _started;
- }
-};
-
-#endif /* TIMER_CC_H_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef UTIL_H
-#define UTIL_H
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <vector>
-#include <map>
-#include <set>
-#include <string>
-#include <sstream>
-#include <string>
-#include <Python.h>
-#include "../../nvmatrix/include/nvmatrix.cuh"
-#include "../../util/include/matrix.h"
-
-
-#define PASS_TYPE uint
-#define PASS_TRAIN 0x1
-#define PASS_TEST 0x2
-#define PASS_GC 0x4
-#define PASS_MULTIVIEW_TEST (PASS_TEST | 0x8)
-#define PASS_MULTIVIEW_TEST_START (PASS_MULTIVIEW_TEST | 0x10)
-#define PASS_MULTIVIEW_TEST_END (PASS_MULTIVIEW_TEST | 0x20)
-#define PASS_FEATURE_GEN 0x40
-
-#define HAS_FLAG(f, x) (((x) & (f)) == (f))
-#define IS_MULTIVIEW_TEST(x) HAS_FLAG(PASS_MULTIVIEW_TEST, x)
-#define IS_MULTIVIEW_TEST_START(x) HAS_FLAG(PASS_MULTIVIEW_TEST_START, x)
-#define IS_MULTIVIEW_TEST_END(x) HAS_FLAG(PASS_MULTIVIEW_TEST_END, x)
-#define IS_TEST(x) HAS_FLAG(PASS_TEST, x)
-#define IS_TRAIN(x) HAS_FLAG(PASS_TRAIN, x)
-
-// For gradient checking
-#define GC_SUPPRESS_PASSES false
-#define GC_REL_ERR_THRESH 0.02
-
-#ifdef DO_PRINT
-#define PRINT(x, args...) printf(x, ## args);
-#else
-#define PRINT(x, args...) ;
-#endif
-
-/*
- * Generates a random floating point number in the range 0-1.
- */
-#define randf ((float)rand() / RAND_MAX)
-
-//typedef std::vector<Matrix*> MatrixV;
-//typedef std::vector<NVMatrix*> NVMatrixV;
-typedef std::map<std::string,std::vector<double>*> CostMap;
-typedef std::map<std::string,double> CostCoeffMap;
-typedef std::vector<double> doublev;
-typedef std::vector<float> floatv;
-typedef std::vector<int> intv;
-typedef std::vector<std::string> stringv;
-typedef std::set<int> seti;
-typedef std::vector<PyObject*> PyObjectV;
-
-stringv* getStringV(PyObject* pyList);
-floatv* getFloatV(PyObject* pyList);
-intv* getIntV(PyObject* pyList);
-MatrixV* getMatrixV(PyObject* pyList);
-MatrixV* getMatrixV(PyObject* pyList, int len);
-int* getIntA(PyObject* pyList);
-
-int pyDictGetInt(PyObject* dict, const char* key);
-intv* pyDictGetIntV(PyObject* dict, const char* key);
-std::string pyDictGetString(PyObject* dict, const char* key);
-float pyDictGetFloat(PyObject* dict, const char* key);
-floatv* pyDictGetFloatV(PyObject* dict, const char* key);
-Matrix* pyDictGetMatrix(PyObject* dict, const char* key);
-MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key);
-int* pyDictGetIntA(PyObject* dict, const char* key);
-stringv* pyDictGetStringV(PyObject* dict, const char* key);
-bool pyDictHasKey(PyObject* dict, const char* key);
-PyObjectV* pyDictGetValues(PyObject* dict);
-
-template<typename T> std::string tostr(T n);
-template<typename T> void shuffleVector(std::vector<T>& v, int start, int end);
-template<class T> void deleteElements(std::vector<T*>& v);
-template<class T> void deleteElements(std::vector<T*>& v, bool deleteContainer);
-
-template<class T>
-int indexOf(std::vector<T>& v, T e) {
- int i = 0;
-// typename vector<T>::iterator it2 = v.begin();
- for (typename std::vector<T>::const_iterator it = v.begin(); it != v.end(); ++it) {
- if (*it == e) {
- return i;
- }
- ++i;
- }
- return -1;
-}
-
-std::vector<int>& getDeviceCPUs(int deviceID);
-
-template<typename K, typename V> std::set<K> getKeys(std::map<K,V>& m) {
- std::set<K> s;
- for (typename std::map<K,V>::const_iterator it = m.begin(); it != m.end(); ++it) {
- s.insert(it->first);
- }
- return s;
-}
-
-struct LayerIDComparator {
- bool operator()(PyObject* i, PyObject* j) {
- return pyDictGetInt(i, "id") < pyDictGetInt(j, "id");
- }
-};
-
-#endif /* UTIL_H */
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef WEIGHTS_CUH
-#define WEIGHTS_CUH
-
-#include <string>
-#include <vector>
-#include <iostream>
-#include <helper_cuda.h>
-#include <assert.h>
-#include "../../nvmatrix/include/nvmatrix.cuh"
-#include "../../util/include/matrix.h"
-#include "util.cuh"
-#include "lr.cuh"
-#include "layer.cuh"
-#include "copypipeline.cuh"
-#include "reducepipeline.cuh"
-#include "streambroadcast.cuh"
-
-class Layer;
-class Weights;
-class StreamBroadcast;
-
-// Abstract interface for reducing per-replica gradient shards toward the
-// target replica (_tgtReplicaID). Concrete strategies are created via make().
-class IWeightReducer {
-protected:
- int _tgtReplicaID;
- std::map<int,Weights*> _replicas;
-
- int getDeviceID();
-public:
- IWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
- virtual ~IWeightReducer();
- // Factory: picks a concrete reducer implementation for the given replicas.
- static IWeightReducer& make(std::map<int,Weights*>& replicas, int srcReplicaID);
- // gradScale scales the reduced gradient; toInc selects the destination
- // (increment matrix vs. gradient matrix) — confirm against implementation.
- virtual void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc) = 0;
-};
-
-// Reducer that uses a StreamBroadcast to combine shards one source at a time.
-class SequentialWeightReducer : public IWeightReducer {
-protected:
- StreamBroadcast* _sb;
-public:
- SequentialWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
- ~SequentialWeightReducer();
- void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc);
-};
-
-// Reducer backed by an IEightGPUReducer reduction network (parallel variant).
-class ParallelWeightReducer : public IWeightReducer {
-protected:
- IEightGPUReducer* _reducer;
-public:
- ParallelWeightReducer(std::map<int,Weights*>& replicas, int srcReplicaID);
- ~ParallelWeightReducer();
- void reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc);
-};
-
-// A weight matrix (plus increment/gradient buffers) belonging to a layer.
-// Keeps both host (Matrix) and device (NVMatrix) copies, knows its fellow
-// replicas, and owns the reducer/broadcaster used to synchronize them.
-class Weights {
-protected:
- Matrix* _hWeights, *_hWeightsInc;
- NVMatrix* _weights, *_weightsInc, *_weightsGrad;
-
- ParameterSchedule* _lrs;
-
- float _wc, _mom, _wball;
- bool _onGPU, _useGrad, _cleanup;
- int _numUpdates;
-
- // Note: every layer is its own sibling too
- std::map<int,Weights*> _replicas;
-
- // Non-NULL if these weights are really shared from some other layer
- Weights* _srcWeights;
- Layer* _parent;
- int _shardSize;
- IWeightReducer* _reducer;
- ISafeBroadcastNetwork* _broadcaster;
-
- void aggregateReplicaGradients(float progress);
-
- // TODO: assert that these return contiguous views
- template<class T> T& getShard(T& mat, int replicaID);
- template<class T> T& getShard(T& mat);
- void init(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc, float wball, float mom, bool useGrad, bool cleanup);
-
-public:
- NVMatrix& operator*() const;
-
- // First ctor shares the underlying weights of srcWeights (see _srcWeights).
- Weights(Weights& srcWeights, ParameterSchedule& lrs, Layer& parent);
- Weights(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent,
- float wc, float wball, float mom, bool useGrad);
-
- virtual ~Weights();
-
- virtual NVMatrix& getW() const;
- virtual NVMatrix& getInc() const;
- virtual NVMatrix& getGrad() const;
- virtual Matrix& getCPUW() const;
- virtual Matrix& getCPUWInc() const;
- virtual ParameterSchedule& getLearningRateSchedule() const;
- virtual int getNumRows() const;
- virtual int getNumCols() const;
- virtual void copyToCPU();
-
- // This function is assumed to be called in the order in which the layers
- // were defined
- virtual void copyToGPU();
-
- virtual void update(float progress);
- virtual void addReplica(Weights& sibling);
- int incNumUpdates();
-
- // Returns the number of times a gradient has been computed for this
- // weight matrix during the current pass (interval between two calls of update())
- // through the net. This number will only be greater than 1 if this weight matrix
- // is *shared* by multiple layers in the net.
- int getNumUpdates() const;
- float getEps(float progress) const;
- float getMom() const;
- float getWC() const;
- float getWBall() const;
- bool isUseGrad() const;
- bool isOwner() const;
- int getReplicaID();
- int getDeviceID();
- Layer& getParent();
- std::map<int,Weights*>& getReplicas();
- ISafeBroadcastNetwork& getBroadcaster();
- IWeightReducer& getReducer();
-};
-
-// Ordered container of Weights objects with batched update/copy operations.
-class WeightList {
-private:
- std::vector<Weights*> _weightList;
-public:
- Weights& operator[](const int idx) const;
- ~WeightList();
- WeightList();
- Weights& at(const int i) const;
- void addWeights(Weights& w);
- void addReplica(WeightList& sibling);
- void update(float progress);
- void copyToCPU();
- void copyToGPU();
- int getSize() const;
-};
-
-#endif /* WEIGHTS_CUH */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef WORKER_CUH
-#define WORKER_CUH
-
-#include "convnet.cuh"
-#include "cost.cuh"
-#include "data.cuh"
-
-class ConvNet;
-class Cost;
-
-// Result message posted by a worker: a result type tag plus (optionally)
-// a Cost object holding the computed costs.
-class WorkResult {
-public:
- enum RESULTS {BATCH_DONE, SYNC_DONE};
-protected:
- WorkResult::RESULTS _resultType;
- Cost* _results;
-public:
- WorkResult(WorkResult::RESULTS resultType, Cost& results);
- WorkResult(WorkResult::RESULTS resultType);
- virtual ~WorkResult();
- Cost& getResults() const;
- WorkResult::RESULTS getResultType() const;
-};
-
-// Base class for units of work executed by the ConvNet manager thread.
-// run() returns true to signal that the manager loop should exit.
-class Worker {
-protected:
- ConvNet* _convNet;
-public:
- Worker(ConvNet& convNet);
- virtual ~Worker();
- virtual bool run() = 0;
-};
-
-// Worker bound to a CPUData batch; subclasses implement _run() while the
-// base run() handles common setup/teardown around it.
-class DataWorker : public Worker {
-protected:
- CPUData* _data;
- DataProvider* _dp;
-public:
- DataWorker(ConvNet& convNet, CPUData& data);
- virtual ~DataWorker();
- bool run();
- virtual void _run() = 0;
-};
-
-// Runs one training (or test, if _test) pass over the bound data batch.
-class TrainingWorker : public DataWorker {
-protected:
- bool _test;
- double _progress;
-public:
- TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test);
- void _run();
-};
-
-// Worker that synchronizes the net (e.g. weight copies) with its threads.
-class SyncWorker : public Worker {
-public:
- SyncWorker(ConvNet& convNet);
- bool run();
-};
-
-// Worker whose run() tells the manager loop to terminate.
-class ExitWorker : public Worker {
-public:
- ExitWorker(ConvNet& convNet);
- bool run();
-};
-
-// Worker that performs numerical gradient checking on the bound batch.
-class GradCheckWorker : public DataWorker {
-public:
- GradCheckWorker(ConvNet& convNet, CPUData& data);
- void _run();
-};
-
-// Test worker that evaluates _numViews views per case, optionally writing
-// the probabilities of the named softmax layer into _cpuProbs.
-class MultiviewTestWorker : public DataWorker {
-protected:
- int _numViews;
- Matrix* _cpuProbs;
- std::string _logregName;
- CPUData& getMinibatch(int v, int i);
-public:
- MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* softmaxName);
- MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews);
- ~MultiviewTestWorker();
- void _run();
-};
-
-// Worker that extracts activations of the named layers into _ftrs;
-// _deleteFeatures controls whether the matrices are freed on destruction.
-class FeatureWorker : public DataWorker {
-protected:
- MatrixV *_ftrs;
- stringv *_layerNames;
- bool _deleteFeatures;
-public:
- FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames, bool deleteFeatures=true);
- ~FeatureWorker();
- void _run();
-};
-
-// Worker that computes gradients with respect to the data layer's input.
-class DataGradWorker : public DataWorker {
-protected:
- Matrix* _dataGrads;
- int _dataLayerIdx, _softmaxLayerIdx;
-public:
- DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx);
- ~DataGradWorker();
- void _run();
-};
-
-#endif/* WORKER_CUH */
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "../include/actbroadcaster.cuh"
-
-using namespace std;
-
-/*
- * =====================
- * BroadcastMessage
- * =====================
- */
-// Broadcast request: mats maps device id -> matrix; srcDevice identifies the
-// source copy; finishQueue receives a token when the broadcast completes.
-BroadcastMessage::BroadcastMessage(map<int, NVMatrix*> mats, int srcDevice, int userIdx, Queue<int>& finishQueue)
- : _type(BROADCAST), _mats(mats), _srcDevice(srcDevice), _userIdx(userIdx), _finishQueue(&finishQueue) {
-}
-
-// Control message (e.g. EXIT) carrying no matrices.
-BroadcastMessage::BroadcastMessage(MESSAGE_TYPE type)
- : _type(type), _finishQueue(NULL) {
-}
-
-int BroadcastMessage::getSrcDevice() {
- return _srcDevice;
-}
-
-map<int, NVMatrix*>& BroadcastMessage::getMatrices() {
- return _mats;
-}
-
-int BroadcastMessage::getUserIdx() {
- return _userIdx;
-}
-
-// Only valid for BROADCAST messages (EXIT messages have a NULL queue).
-Queue<int>& BroadcastMessage::getFinishQueue() {
- return *_finishQueue;
-}
-
-BroadcastMessage::MESSAGE_TYPE BroadcastMessage::getMessageType() {
- return _type;
-}
-
-/*
- * =====================
- * ExitBroadcastMessage
- * =====================
- */
-// Sentinel message that tells the ActBroadcaster thread to shut down.
-ExitBroadcastMessage::ExitBroadcastMessage() : BroadcastMessage(BroadcastMessage::EXIT) {
-}
-
-/*
- * =====================
- * ActBroadcaster
- * =====================
- */
-// Thread that serves broadcast requests from numUsers producers, pinned to
-// the given CPUs.
-ActBroadcaster::ActBroadcaster(int numUsers, intv& cpus) : Thread(true, cpus), _numUsers(numUsers) {
-}
-
-// Frees the per-source-device broadcast networks created lazily in run().
-ActBroadcaster::~ActBroadcaster() {
- for (map<int,IBroadcastNetwork*>::const_iterator it = _broadcasters.begin(); it != _broadcasters.end(); ++it) {
- delete it->second;
- }
-}
-
-Queue<BroadcastMessage*>& ActBroadcaster::getMessageQueue() {
- return _messageQueue;
-}
-
-// Main loop: serves BROADCAST messages strictly in round-robin user order
-// (nextUserIdx); out-of-turn messages are re-enqueued until their turn.
-void* ActBroadcaster::run() {
- int nextUserIdx = 0;
- bool exit = false;
- while (!exit) {
- BroadcastMessage& msg = *_messageQueue.dequeue();
- if (msg.getMessageType() == BroadcastMessage::EXIT) {
- exit = true;
- delete &msg;
- } else {
- if (msg.getUserIdx() == nextUserIdx) {
- // Lazily build one broadcast network per source device.
- if (_broadcasters.count(msg.getSrcDevice()) == 0) {
- _broadcasters[msg.getSrcDevice()] = &IBroadcastNetwork::make(getKeys(msg.getMatrices()), msg.getSrcDevice());
- }
- _broadcasters[msg.getSrcDevice()]->broadcast(msg.getMatrices());
- // Signal completion to the requester, then advance the turn.
- msg.getFinishQueue().enqueue(0);
- delete &msg;
- nextUserIdx = (nextUserIdx + 1) % _numUsers;
- } else {
- // Not this user's turn yet: push the message back onto the queue.
- _messageQueue.enqueue(&msg);
- }
- }
- }
- return NULL;
-}
-
-// Requests shutdown via an EXIT message and waits for the thread to finish.
-void ActBroadcaster::stop() {
- getMessageQueue().enqueue(new ExitBroadcastMessage());
- join();
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vector>
-#include <iostream>
-#include <string>
-#include <set>
-#include <map>
-
-#include "../../nvmatrix/include/nvmatrix.cuh"
-#include "../../nvmatrix/include/nvmatrix_operators.cuh"
-#include "../../util/include/matrix.h"
-#include "../include/convnet.cuh"
-#include "../include/util.cuh"
-
-using namespace std;
-
-/*
- * =======================
- * ConvNet
- * =======================
- */
-// Builds the whole network from the Python layer-parameter dict: sorts layers
-// by id, creates data layers and per-GPU threads, wires replicas and
-// forward/backward links, then counts forward/backward terminal layers.
-ConvNet::ConvNet(PyObject* layerParams, intv& deviceIDs,
- int minibatchSize, bool conserveMem) : Thread(true) {
- _deviceIDs = deviceIDs;
- _data = NULL;
- _bufferData = NULL;
- _bufferMinibatchIdx = -1;
- _bufferPassIdx = -1;
- _trainingProgress = 0;
- _totalPassesDone = 0;
- _conserveMem = conserveMem;
- // +1: the manager thread itself also participates in synchronization.
- _sync = new ThreadSynchronizer(deviceIDs.size() + 1);
- PyObjectV* layerList = pyDictGetValues(layerParams);
- std::sort(layerList->begin(), layerList->end(), LayerIDComparator());
-
-
- _dataCopyPD = new PipeDispenserBlocking(DIVUP(_deviceIDs.size(),2)); // hard-coded for now
-
- initDataLayers(layerList);
- initGPUThreads(layerList);
- connectReplicas(); // Connect replicas to one another
- connectChildren(layerParams); // Connect forward/backward links in graph
- _numFwdTerminal = 0;
- // Execute post-initialization stuff
- for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
- for (int r = 0; r < it->second.size(); r++) {
- // A layer with no consumers is a forward-pass terminal.
- _numFwdTerminal += it->second[r]->getNext().size() == 0;
- if (it->second[r]->getNext().size() == 0) {
- printf("Fwd terminal: %s\n", it->second[r]->getName().c_str());
- }
- it->second[r]->postInit();
- }
- }
-
- // Find and count the terminal nodes in the backward pass
- for (int p = 0; p < getNumPasses(); p++) {
- set<Layer*> visited;
- _numBwdTerminal[p] = 0;
- for (int t = 0; t < _convNetThreads.size(); t++) {
- vector<CostLayer*>& cl = _convNetThreads[t]->getCostLayers();
- for (int c = 0; c < cl.size(); c++) {
- findBwdTerminal(*cl[c], visited, _numBwdTerminal[p], p);
- }
- }
- }
-
- _dp = new DataProvider(minibatchSize);
-// Py_DECREF(layerList);
- delete layerList;
-}
-
-// Shuts down and frees the GPU threads, data layers, memory managers and
-// other owned resources.
-ConvNet::~ConvNet() {
- for (vector<ConvNetThread*>::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) {
- // Ask each GPU thread to exit before joining and deleting it.
- (*it)->getMessageQueue().enqueue(new Message(EXIT_CONVNET));
- (*it)->join();
- delete *it;
- }
- for (DataLayerVector::const_iterator it = _dataLayers.begin(); it != _dataLayers.end(); ++it) {
- delete *it;
- }
- for (intv::const_iterator it = _deviceIDs.begin(); it != _deviceIDs.end(); ++it) {
- DEVICE_MEMORY_MANAGER::destroyInstance(*it);
- }
- HOST_MEMORY_MANAGER::destroyInstance();
- delete _sync;
- delete _dataCopyPD;
- delete _dp;
-}
-
-// Stops the manager loop by enqueueing an ExitWorker, then joins the thread.
-void ConvNet::stop() {
- getWorkerQueue().enqueue(new ExitWorker(*this));
- join();
-}
-
-PipeDispenser& ConvNet::getDataCopyPD() {
- return *_dataCopyPD;
-}
-
-// Creates one DataLayer per replica for every layer of type "data" and
-// registers it in _layerMap under its name and replica id.
-void ConvNet::initDataLayers(PyObjectV* layerList) {
- for (int i = 0; i < layerList->size(); i++) {
- PyObject* paramsDict = layerList->at(i);
- std::string layerType = pyDictGetString(paramsDict, "type");
-
- if (layerType == "data") {
- int numReplicas = pyDictGetInt(paramsDict, "numReplicas");
- for (int r = 0; r < numReplicas; ++r) {
- DataLayer* dataLayer = new DataLayer(this, paramsDict, r);
- _dataLayers.push_back(dataLayer);
- _layerMap[dataLayer->getName()][r] = dataLayer;
- }
- }
- }
-}
-
-// Spawns one ConvNetThread per device and merges each thread's layers into
-// the global name -> (replica id -> layer) map.
-void ConvNet::initGPUThreads(PyObjectV* layerList) {
- // Initialize GPU worker threads
- for (int i = 0; i < _deviceIDs.size(); ++i) {
- ConvNetThread* cng = new ConvNetThread(layerList, _deviceIDs[i], i, this);
- _convNetThreads.push_back(cng);
- for (NameLayerMap::iterator it = cng->getLayerMap().begin(); it != cng->getLayerMap().end(); ++it) {
- const std::string& name = it->first;
- Layer* layer = it->second;
- _layerMap[name][layer->getReplicaID()] = layer;
- }
- }
-}
-
-// Introduces every replica of a layer to every other replica of the same
-// layer (including itself) and records the min/max replica counts.
-void ConvNet::connectReplicas() {
- _numReplicasMax = 0;
- _numReplicasMin = 1 << 16;
- for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
- _numReplicasMax = max(_numReplicasMax, int(it->second.size()));
- _numReplicasMin = min(_numReplicasMin, int(it->second.size()));
- for (map<int,Layer*>::iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2) {
- Layer& l1 = *it2->second;
- for (map<int,Layer*>::iterator it3 = it->second.begin(); it3 != it->second.end(); ++it3) {
- Layer& l2 = *it3->second;
- l1.addReplica(l2);
- }
- }
- }
-}
-
-// Wires the forward/backward graph from each layer's "inputs" list. When the
-// input layer has more replicas than this layer, each replica of this layer
-// connects to numReplicasPrev/numReplicas input replicas, strided by
-// numReplicas.
-void ConvNet::connectChildren(PyObject* layerParams) {
- for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
- PyObject* paramsDict = PyDict_GetItemString(layerParams, it->first.c_str());
- PyObject* inputList = PyDict_GetItemString(paramsDict, "inputs");
- if (inputList != NULL) {
- // Iterate over "replicas" of this layer
- int numReplicas = _layerMap[it->first].size();
- for (int i = 0; i < PyList_GET_SIZE(inputList); i++) {
- std::string inputName = PyString_AsString(PyList_GetItem(inputList, i));
- int numReplicasPrev = _layerMap[inputName].size();
- // How many replicas from the previous layer must this layer be connected to?
- int numInputReplicas = numReplicasPrev / numReplicas;
- for (int r = 0; r < numReplicas; r++) {
- for (int rp = r, ridx = 0; ridx < numInputReplicas; rp += numReplicas, ridx++) {
- it->second[r]->addPrev(*_layerMap[inputName][rp], ridx);
- _layerMap[inputName][rp]->addNext(*it->second[r]);
- }
- }
- }
- }
- }
-}
-
-// Depth-first search from a cost layer that marks and counts backward-pass
-// terminals for the given pass index: gradient consumers with no gradient
-// consumers below them, non-producers, or replica-boundary layers on
-// intermediate passes.
-void ConvNet::findBwdTerminal(Layer& l, set<Layer*>& visited, int& terminal, int passIdx) {
- if (visited.count(&l) == 0) {
- visited.insert(&l);
- if (l.isGradConsumer()) {
- bool hasPrevConsumer = false;
- if (l.getPrev().size() > 0) {
- for (int i = 0; i < l.getPrev()[0].size(); i++) {
- // Looking only at 0th replica is fine to see if you have
- // grad consumers below you.
- hasPrevConsumer |= l.getPrev()[0][i]->isGradConsumer();
- }
- }
- if (!hasPrevConsumer || !l.isGradProducer() || (passIdx + 1 < l.getNumReplicasPrev() && l.getNumReplicasPrev() > l.getNumReplicas())) {
- terminal++;
- l.setBwdTerminal(passIdx);
- printf("found bwd terminal %s[%d] in passIdx=%d\n", l.getName().c_str(), l.getReplicaID(), passIdx);
- } else if (l.isGradProducer()) {
- for (int r = 0; r < l.getPrev().size(); r++) {
- for (int i = 0; i < l.getPrev()[r].size(); i++) {
- findBwdTerminal(*l.getPrev()[r][i], visited, terminal, passIdx);
- }
- }
- }
- }
- }
-}
-
-// Manager thread entry point: starts all GPU threads, copies weights to the
-// GPU, then services Worker objects until one of them (ExitWorker) returns
-// true.
-void* ConvNet::run() {
- for (vector<ConvNetThread*>::const_iterator it = _convNetThreads.begin(); it != _convNetThreads.end(); ++it) {
- (*it)->start();
- }
- // The manager thread defaults to using the GPU of the first worker.
- // Put more logic here if this is inappropriate.
- NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID());
- copyToGPU();
- bool exit = false;
- while (!exit) {
- Worker* worker = _workerQueue.dequeue();
- exit = worker->run();
- delete worker;
- }
-
- return NULL;
-}
-
-// Simple accessors for the worker/result queues, the data provider, and a
-// layer replica looked up by name.
-Queue<Worker*>& ConvNet::getWorkerQueue() {
- return _workerQueue;
-}
-
-Queue<WorkResult*>& ConvNet::getResultQueue() {
- return _resultQueue;
-}
-
-DataProvider& ConvNet::getDataProvider() {
- return *_dp;
-}
-
-Layer& ConvNet::getLayer(std::string& name, int replicaID) {
- return *_layerMap[name][replicaID];
-}
-
-void ConvNet::sendMessage(MESSAGES msg, bool sync) {
- sendMessage(new Message(msg), sync);
-}
-
-// Delivers a clone of msg to every GPU thread (the original is deleted);
-// optionally blocks until all threads acknowledge via syncWithChildren().
-void ConvNet::sendMessage(Message* msg, bool sync) {
- for (int i = 0; i < _convNetThreads.size(); i++) {
- _convNetThreads[i]->getMessageQueue().enqueue(msg->clone());
- }
-
- delete msg;
-
- if (sync) {
- syncWithChildren();
- }
-}
-
-// Broadcast-style operations: each sends a message to all GPU threads.
-void ConvNet::copyToCPU() {
- sendMessage(COPY_TO_CPU, true);
-}
-
-void ConvNet::copyToGPU() {
- sendMessage(COPY_TO_GPU, false);
-}
-
-// passIdx currently unused; update and constraint are two synchronized steps.
-void ConvNet::updateWeights(int passIdx) {
- sendMessage(UPDATE_WEIGHTS, true);
- sendMessage(CONSTRAIN_WEIGHTS, true);
-}
-
-// Full RESET only at the start of a pass cycle; otherwise just reset pass idx.
-void ConvNet::reset(int passIdx) {
- sendMessage((passIdx % getNumPasses()) == 0 ? RESET : RESET_PASS_IDX, false);
-}
-
-void ConvNet::reset() {
- reset(0);
-}
-
-// Fprop given data
-void ConvNet::fprop(CPUData& data, int passIdx, PASS_TYPE passType) {
- reset(passIdx);
- // This is necessary because setData below could delete data. If there's
- // an outstanding copy request, this'll cause a segfault.
- for (int i = 0; i < _dataLayers.size(); i++) {
- _dataLayers[i]->waitForCopyFinish();
- }
-
- setData(data, passIdx);
- for (int i = 0; i < _dataLayers.size(); i++) {
- _dataLayers[i]->fprop(passType, passIdx, false);
- }
- waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
-}
-
-// Fprop given minibatch idx
-void ConvNet::fprop(int miniIdx, int passIdx, PASS_TYPE passType) {
- reset(passIdx);
-
- bool fromBuffer = miniIdx == _bufferMinibatchIdx && passIdx == _bufferPassIdx;
- if (!fromBuffer) {
- // This is necessary because setData below could delete data. If there's
- // an outstanding copy request, this'll cause a segfault.
- for (int i = 0; i < _dataLayers.size(); i++) {
- _dataLayers[i]->waitForCopyFinish();
- }
-
- setData(_dp->getMinibatch(miniIdx), passIdx);
-
- } else {
- setDataFromBuffer();
- }
- for (int i = 0; i < _dataLayers.size(); i++) {
- _dataLayers[i]->fprop(passType, passIdx, fromBuffer);
- }
-
- if (passIdx == getNumPasses() - 1) {
- // Do double-buffering from next minibatch from the DataProvider
- setBuffer(miniIdx == _dp->getNumMinibatches() - 1 ? NULL : &_dp->getMinibatch(miniIdx + 1), miniIdx + 1, 0);
- } else {
- // Do double-buffering from next microbatch within current minibatch
- setBuffer(_data, miniIdx, passIdx + 1);
- }
-
- waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
-}
-
-// Promotes the buffered batch to be the current batch, freeing the old one
-// unless it is the same object as the buffer.
-void ConvNet::setDataFromBuffer() {
- if (_bufferData != _data) {
- delete _data;
- }
- _data = _bufferData;
- _bufferData = NULL;
- _bufferMinibatchIdx = -1;
- _bufferPassIdx = -1;
-}
-
-// Installs a new current batch and starts copies into the data layers.
-// The aliasing checks guard against double-freeing when data, _data and
-// _bufferData overlap.
-void ConvNet::setData(CPUData& data, int passIdx) {
- bool same = _data == _bufferData;
- if (&data != _data) {
- delete _data;
- }
- if (&data != _bufferData && !same) {
- delete _bufferData;
- _bufferData = NULL;
- _bufferMinibatchIdx = -1;
- _bufferPassIdx = -1;
- }
- _data = &data;
- for (int i = 0; i < _dataLayers.size(); i++) {
- _dataLayers[i]->copyData(*_data, false, passIdx);
- }
-}
-
-// Records the prefetch buffer (may be NULL at the end of an epoch) and kicks
-// off background copies of it into the data layers.
-void ConvNet::setBuffer(CPUData* bufferData, int bufferMinibatchIdx, int bufferPassIdx) {
- _bufferData = bufferData;
- _bufferMinibatchIdx = bufferMinibatchIdx;
- _bufferPassIdx = bufferPassIdx;
- if (bufferData != NULL) {
- for (int i = 0; i < _dataLayers.size(); i++) {
- _dataLayers[i]->copyData(*_bufferData, true, bufferPassIdx);
- }
- }
-}
-
-CPUData& ConvNet::getData() {
- assert(_data != NULL);
- return *_data;
-}
-
-// Starts the backward pass on all threads and blocks until every backward
-// terminal for this pass has reported, then resets for the next pass.
-void ConvNet::bprop(int passIdx, PASS_TYPE passType) {
- _totalPassesDone++;
- sendMessage(new BpropStartMessage(passType, passIdx), false);
- waitForTerminals(_numBwdTerminal[passIdx], BPROP_TERMINAL);
- reset(passIdx + 1);
-}
-
-// Blocks until numMsgs terminal messages of the given type have arrived.
-void ConvNet::waitForTerminals(int numMsgs, MESSAGES msgType) {
- for (int rcvd = 0; rcvd < numMsgs; rcvd++) {
- Message* m = _msgQueue.dequeue();
- assert(m->getType() == msgType);
- delete m;
- }
-}
-
-// Same as getCost() but adds results to given cost and returns it
-Cost& ConvNet::getCost(Cost& cost) {
- Cost &tmp = getCost();
- cost += tmp;
- delete &tmp;
- return cost;
-}
-
-// Sums the costs of all GPU threads into a freshly allocated Cost object;
-// caller owns (and must delete) the result.
-Cost& ConvNet::getCost() {
- Cost& cost = *new Cost();
- for (int t = 0; t < _convNetThreads.size(); t++) {
- Cost& tcost = _convNetThreads[t]->getCost();
- cost += tcost;
- delete &tcost;
- }
- return cost;
-}
-
-// Scalar total cost value; frees the intermediate Cost object itself.
-double ConvNet::getCostValue() {
- Cost& cost = getCost();
- double val = cost.getValue();
- delete &cost;
- return val;
-}
-
-// Misc accessors and small state setters.
-Queue<Message*>& ConvNet::getMessageQueue() {
- return _msgQueue;
-}
-
-intv& ConvNet::getDeviceIDs() {
- return _deviceIDs;
-}
-
-ThreadSynchronizer& ConvNet::getSync() {
- return *_sync;
-}
-
-// Sends SYNC to all threads, then joins the rendezvous itself.
-void ConvNet::syncWithChildren() {
- sendMessage(SYNC, false);
- _sync->sync();
-}
-
-int ConvNet::getTotalPassesDone() {
- return _totalPassesDone;
-}
-
-int ConvNet::getMinibatchSize() {
- return _dp->getMinibatchSize();
-}
-
-int ConvNet::getNumReplicasMax() {
- return _numReplicasMax;
-}
-
-int ConvNet::getNumReplicasMin() {
- return _numReplicasMin;
-}
-
-// Number of passes per minibatch = ratio of max to min replica count.
-int ConvNet::getNumPasses() {
- return _numReplicasMax / _numReplicasMin;
-}
-
-void ConvNet::setTrainingProgress(double progress) {
- _trainingProgress = progress;
-}
-
-double ConvNet::getTrainingProgress() const {
- return _trainingProgress;
-}
-
-bool ConvNet::isConserveMemory() {
- return _conserveMem;
-}
-
-/*
- * Gradient checking stuff
- */
-// Computes the baseline cost over all passes, then asks each (GPU-resident)
-// layer to check its own gradients, and prints a pass/fail summary.
-void ConvNet::checkGradients() {
- _numFailures = 0;
- _numTests = 0;
- _baseErr = 0;
- for (int p = 0; p < getNumPasses(); ++p) {
- fprop(0, p, PASS_GC);
- _baseErr += getCostValue();
- bprop(p, PASS_GC);
- }
- // We call grad check only on the first replica,
- // but because weights are aware of their fellow replicas,
- // we can simultaneously perturb the weights of all
- // replicas.
- for (NameReplicaLayerMap::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
- map<int, Layer*>& layers = it->second;
- if (layers[0]->getDeviceID() >= 0 /*&& (layers[0]->getName() == "fc10")*/) { // If layer on GPU (data layers aren't)
- layers[0]->checkGradient();
- }
- }
-
- cout << "------------------------" << endl;
- if (_numFailures > 0) {
- cout << _numFailures << "/" << _numTests << " TESTS FAILED" << endl;
- } else {
- cout << "ALL " << _numTests << " TESTS PASSED" << endl;
- }
-}
-
-// Copies to all replicas
-// Restores the caller's current device when done.
-void ConvNet::checkGradient_copyWeightsToGPU(Matrix& weightsCPU, Weights& weights) {
- int d = NVMatrix::getDeviceID();
- for (map<int, Weights*>::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) {
- NVMatrix::setDeviceID(it->second->getDeviceID());
- it->second->getW().copyFromHost(weightsCPU);
- }
- NVMatrix::setDeviceID(d);
-}
-
-/*
- * name: weight matrix name
- * eps: finite difference step
- *
- * Numerically estimates the gradient of the cost with respect to each weight
- * (one-sided finite differences against _baseErr), compares it to the
- * analytic gradient reduced across replicas, and returns true on FAILURE
- * (relative error >= GC_REL_ERR_THRESH).
- */
-bool ConvNet::checkGradient(const std::string& name, float eps, Weights& weights) {
- Matrix numGrad(weights.getNumRows(), weights.getNumCols());
- Matrix diff(numGrad);
- numGrad.apply(Matrix::ZERO);
- Matrix weightsCPU;
-
- weights.getW().copyToHost(weightsCPU, true);
-
- for(int i = 0; i < weights.getNumRows(); i++) {
- for (int j = 0; j < weights.getNumCols(); j++) {
- // Perturb one weight by eps, push to all replicas, re-measure cost.
- float v = weightsCPU(i,j);
- weightsCPU(i,j) += eps;
-
- checkGradient_copyWeightsToGPU(weightsCPU, weights);
-
- weightsCPU(i,j) = v;
- double err = 0;
- for (int p = 0; p < getNumPasses(); ++p) {
-// printf("trying fprop %d\n", p);
- fprop(0, p, PASS_GC);
-// printf(" success\n");
- err += getCostValue();
- }
- numGrad(i,j) = (err - _baseErr) / (_data->getNumCases() * eps);
- if (isnan((double)numGrad(i,j)) || isinf((double)numGrad(i,j))) {
- cout << "Numerical computation produced nan or inf when checking '" << name << "': " << numGrad(i,j) << endl;
- cout << "Consider reducing the sizes of the weights or finite difference steps." << endl;
- cout << "Exiting." << endl;
- exit(1);
- }
- // Restore the unperturbed weights on the GPU.
- checkGradient_copyWeightsToGPU(weightsCPU, weights);
- }
- }
- Matrix gradCPU;
- NVMatrix::setDeviceID(weights.getDeviceID());
- // Reduce the analytic gradients of all replicas before comparing.
- map<int,NVMatrix*> mats;
- for (map<int, Weights*>::const_iterator it = weights.getReplicas().begin(); it != weights.getReplicas().end(); ++it) {
- mats[it->first] = &it->second->getGrad();
- }
- weights.getReducer().reduce(mats, 1, false);
-
- weights.getGrad().copyToHost(gradCPU, true);
- gradCPU.scale(-1.0 / _data->getNumCases());
- float analNorm = gradCPU.norm();
- float numNorm = numGrad.norm();
- numGrad.subtract(gradCPU, diff);
- float relErr = diff.norm() / analNorm;
- bool fail = relErr >= GC_REL_ERR_THRESH;
- if (fail || !GC_SUPPRESS_PASSES) {
- cout << "========================" << endl;
- printf("(%s) %s GRADIENT CHECK\n", fail ? "****FAIL****" : "PASS", name.c_str());
- cout << "========================" << endl;
- cout << "Analytic:" << endl;
- gradCPU.print(0, 6, 0, 4);
- cout << "Numeric:" << endl;
- numGrad.print(0, 6, 0, 4);
- printf("Analytic norm: %e\n", analNorm);
- printf("Numeric norm: %e\n", numNorm);
- printf("Relative error: %e\n", relErr);
- }
- _numTests++;
- _numFailures += fail;
- return fail;
-}
-
-/*
- * =======================================================================================================
- * ConvNetThread
- * =======================================================================================================
- */
-// Per-GPU worker thread: instantiates every non-data layer whose "gpu" list
-// contains this thread's device index (the list position becomes the layer's
-// replica id).
-ConvNetThread::ConvNetThread(PyObjectV* layerList, int deviceID, int deviceIdx, ConvNet* convNet)
- : Thread(true, getDeviceCPUs(deviceID)), _deviceID(deviceID), _convNet(convNet) {
- try {
- int numLayers = layerList->size();
-
- for (int i = 0; i < numLayers; i++) {
- PyObject* paramsDict = layerList->at(i);
- std::string layerType = pyDictGetString(paramsDict, "type");
- if (layerType != "data") {
- intv& gpus = *pyDictGetIntV(paramsDict, "gpu");
- int rid = indexOf(gpus, deviceIdx);
- if (rid >= 0) {
- initLayer(paramsDict, rid);
- }
- delete &gpus;
- }
- }
- } catch (std::string& s) {
- cout << "Error creating ConvNet: " << s << endl;
- exit(1);
- }
-}
-
-// Tears down this thread's CUDA state (cuBLAS, RNG) and deletes its layers.
-ConvNetThread::~ConvNetThread() {
- NVMatrix::setDeviceID(_deviceID);
- NVMatrix::destroyCublas();
- NVMatrix::destroyRandom();
- for (NameLayerMap::const_iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
- delete it->second;
- }
- _nameLayerMap.clear();
-}
-
-// Timing helpers; both sync the CUDA stream so measurements cover completed
-// GPU work only.
-void ConvNetThread::startTimer() {
- NVMatrix::syncStream();
- _timer.start();
-}
-
-double ConvNetThread::stopTimer() {
- NVMatrix::syncStream();
- return _timer.stop();
-}
-
-// Dispatches on the "type" string to construct the matching layer class and
-// registers it under its "name". Cost layers are additionally collected in
-// _costs. Throws a std::string for unknown types.
-void ConvNetThread::initLayer(PyObject* paramsDict, int replicaID) {
- std::string type = pyDictGetString(paramsDict, "type");
- std::string name = pyDictGetString(paramsDict, "name");
- if (type == "fc") {
- _nameLayerMap[name] = new FCLayer(this, paramsDict, replicaID, false);
- } else if (type == "sfc") {
- _nameLayerMap[name] = new SplitFCLayer(this, paramsDict, replicaID, false);
- } else if (type == "conv") {
- _nameLayerMap[name] = new ConvLayer(this, paramsDict, replicaID);
- } else if (type == "local") {
- _nameLayerMap[name] = new LocalUnsharedLayer(this, paramsDict, replicaID);
- } else if (type == "pool") {
- _nameLayerMap[name] = &PoolLayer::make(this, paramsDict, replicaID);
- } else if (type == "cmpool") {
- _nameLayerMap[name] = &CrossMapPoolLayer::make(this, paramsDict, replicaID);
- } else if (type == "rnorm") {
- _nameLayerMap[name] = new ResponseNormLayer(this, paramsDict, replicaID);
- } else if (type == "cmrnorm") {
- _nameLayerMap[name] = new CrossMapResponseNormLayer(this, paramsDict, replicaID);
- } else if (type == "cnorm") {
- _nameLayerMap[name] = new ContrastNormLayer(this, paramsDict, replicaID);
- } else if (type == "softmax") {
- _nameLayerMap[name] = new SoftmaxLayer(this, paramsDict, replicaID);
- } else if (type == "eltsum") {
- _nameLayerMap[name] = new EltwiseSumLayer(this, paramsDict, replicaID);
- } else if (type == "eltmax") {
- _nameLayerMap[name] = new EltwiseMaxLayer(this, paramsDict, replicaID);
- } else if (type == "neuron") {
- _nameLayerMap[name] = new NeuronLayer(this, paramsDict, replicaID);
- } else if (type == "nailbed") {
- _nameLayerMap[name] = new NailbedLayer(this, paramsDict, replicaID);
- } else if (type == "blur") {
- _nameLayerMap[name] = new GaussianBlurLayer(this, paramsDict, replicaID);
- } else if (type == "href") {
- _nameLayerMap[name] = new HorizontalReflectionLayer(this, paramsDict, replicaID);
- } else if (type == "resize") {
- _nameLayerMap[name] = new ResizeLayer(this, paramsDict, replicaID);
- } else if (type == "rgb2yuv") {
- _nameLayerMap[name] = new RGBToYUVLayer(this, paramsDict, replicaID);
- } else if (type == "rgb2lab") {
- _nameLayerMap[name] = new RGBToLABLayer(this, paramsDict, replicaID);
- } else if (type == "rscale") {
- _nameLayerMap[name] = new RandomScaleLayer(this, paramsDict, replicaID);
- } else if (type == "crop") {
- _nameLayerMap[name] = new CropLayer(this, paramsDict, replicaID);
- } else if (type == "concat") {
- _nameLayerMap[name] = new ConcatenationLayer(this, paramsDict, replicaID);
- } else if (type == "pass") {
- _nameLayerMap[name] = new PassThroughLayer(this, paramsDict, replicaID);
- } else if (type == "dropout") {
- _nameLayerMap[name] = new DropoutLayer(this, paramsDict, replicaID);
- } else if (type == "dropout2") {
- _nameLayerMap[name] = new Dropout2Layer(this, paramsDict, replicaID);
- } else if (strncmp(type.c_str(), "cost.", 5) == 0) {
- // All cost layer types share the "cost." prefix.
- CostLayer *c = &CostLayer::make(this, paramsDict, type, replicaID);
- _nameLayerMap[name] = c;
- _costs.push_back(c);
- } else {
- throw std::string("Unknown layer type ") + type;
- }
-}
-
-/*
- * This executes in a new CPU thread so it's OK to initialize CUDA stuff here.
- *
- * Binds the thread to its device, prefers shared-memory cache config,
- * enables peer access to every other device that supports it, and
- * initializes cuBLAS and the RNG.
- */
-void ConvNetThread::initCuda() {
- NVMatrix::setDeviceID(_deviceID);
- checkCudaErrors(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
- for (int i = 0; i < _convNet->getDeviceIDs().size(); i++) {
- int d = _convNet->getDeviceIDs()[i];
- if (d != _deviceID) {
- if (NVMatrix::canAccessPeer(_deviceID, d)) {
- printf("Enabling peer access GPU %d --> GPU %d\n", NVMatrix::getDeviceID(), d);
- checkCudaErrors(cudaDeviceEnablePeerAccess(d, 0));
- } else {
- printf("No peer access GPU %d --> GPU %d\n", _deviceID, d);
- }
- }
- }
-// NVMatrix::syncStream();
- NVMatrix::initCublas();
- NVMatrix::initRandom(/*7*/);
- srand(time(0));
-}
-
-void* ConvNetThread::run() {
- initCuda();
- bool exit = false;
- while (!exit) {
- Message* m = _msgQueue.dequeue();
- if (m->getType() == FPROP_READY) {
- FpropMessage* msg = static_cast<FpropMessage*>(m);
- msg->getToLayer().fprop(msg->getPassType(), msg->getPassIdx());
- } else if (m->getType() == BPROP_READY) {
- BpropMessage* msg = static_cast<BpropMessage*>(m);
- msg->getToLayer().incRcvdBInputMsgs();
- msg->getToLayer().bprop(msg->getPassType(), msg->getPassIdx());
- } else if (m->getType() == BPROP_START) {
- BpropStartMessage* msg = static_cast<BpropStartMessage*>(m);
- for (int i = 0; i < _costs.size(); i++) {
- dynamic_cast<Layer*>(_costs[i])->bprop(msg->getPassType(), msg->getPassIdx());
- }
- } else if (m->getType() == SYNC) {
- NVMatrix::syncStream();
- _convNet->getSync().sync();
- } else if (m->getType() == COPY_TO_CPU) {
- for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
- it->second->copyToCPU();
- }
- } else if (m->getType() == COPY_TO_GPU) {
- for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
- it->second->copyToGPU();
- }
- } else if (m->getType() == RESET) {
- for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
- it->second->reset();
- }
- } else if (m->getType() == RESET_PASS_IDX) {
- for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
- it->second->resetPassIdx();
- }
- } else if (m->getType() == UPDATE_WEIGHTS) {
- for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
- it->second->updateWeights();
- }
- } else if (m->getType() == CONSTRAIN_WEIGHTS) {
- for (NameLayerMap::iterator it = _nameLayerMap.begin(); it != _nameLayerMap.end(); ++it) {
- it->second->constrainWeights();
- }
- } else if (m->getType() == EXIT_CONVNET) {
- exit = true;
- }
- delete m;
- }
- return NULL;
-}
-
-Cost& ConvNetThread::getCost() {
- // In a single ConvNetThread, all costs are guaranteed to be different
- // (i.e. not replicas of one another)
- return *new Cost(_costs);
-}
-
-Layer& ConvNetThread::getLayer(std::string& name) {
- return *_nameLayerMap[name];
-}
-
-int ConvNetThread::getDeviceID() {
- return _deviceID;
-}
-
-Queue<Message*>& ConvNetThread::getMessageQueue() {
- return _msgQueue;
-}
-
-vector<CostLayer*>& ConvNetThread::getCostLayers() {
- return _costs;
-}
-
-NameLayerMap& ConvNetThread::getLayerMap() {
- return _nameLayerMap;
-}
-
-ConvNet& ConvNetThread::getConvNet() {
- return *_convNet;
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/copypipeline.cuh"
-//#include "gpu_util.cuh"
-
-using namespace std;
-
-/* =========================
- * ICopySegment
- * =========================
- */
-ICopySegment::ICopySegment(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue)
- : _parent(&parent), _prev(NULL), _stream(NULL), _deviceID(deviceID), _finishQueue(finishQueue), Thread(true, getDeviceCPUs(parent.getSourceDeviceID())) {
- _execDeviceID = _deviceID;
-}
-
-ICopySegment::~ICopySegment() {
- if (_stream != NULL) {
- checkCudaErrors(cudaStreamDestroy(_stream));
- }
-}
-
-void* ICopySegment::run() {
- assert(_execDeviceID != DEVICE_HOST);
- NVMatrix::setDeviceID(_execDeviceID);
- checkCudaErrors(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking));
- bool exit = false;
- while (!exit) {
- CopyMessage& msg = *_queue.dequeue();
- if (msg.getType() == CopyMessage::EXIT) {
- exit = true;
- } else {
- bool term = processMessage(msg);
- if (term) {
- assert(_finishQueue != NULL);
- _finishQueue->enqueue(1);
- }
- }
- delete &msg;
- }
- return NULL;
-}
-
-NVMatrix& ICopySegment::getChunk(NVMatrix& mat, int chunkSize, int chunkIdx) {
- NVMatrix& line = mat.reshaped(1, mat.getNumElements());
- int start = chunkIdx * chunkSize;
- int end = min((chunkIdx+1) * chunkSize, mat.getNumElements());
- NVMatrix& chunk = line.sliceCols(start, end);
- delete &line;
- return chunk;
-}
-
-inline NVMatrix& ICopySegment::getMatrix(CopyMessage& msg) {
- if (getDeviceID() == DEVICE_HOST) {
- return _hmat;
- }
- return msg.getMatrix(getDeviceID());
-}
-
-Queue<CopyMessage*>& ICopySegment::getQueue() {
- return _queue;
-}
-
-inline int ICopySegment::getDeviceID() {
- return _deviceID;
-}
-
-void ICopySegment::addPrev(ICopySegment& c) {
- _prev = &c;
- if (_deviceID == DEVICE_HOST) {
- _execDeviceID = c.getDeviceID();
- }
-}
-
-void ICopySegment::addNext(CopyPeer& c) {
- _next.push_back(&c);
- c.addPrev(*this);
-}
-
-bool ICopySegment::isTerminal() const {
- return _next.size() == 0;
-}
-
-/* =========================
- * CopySource
- * =========================
- */
-CopySource::CopySource(IBroadcastNetwork& parent, int deviceID) : ICopySegment(parent, deviceID, NULL) {
-}
-
-bool CopySource::processMessage(CopyMessage& msg) {
- assert(msg.getType() == CopyMessage::COPY_START);
- int numChunks = min(getMatrix(msg).getNumElements(), max(COPY_MIN_CHUNKS, min(COPY_MAX_CHUNKS, DIVUP(getMatrix(msg).getNumElements(), COPY_MIN_CHUNK_SIZE))));
- int chunkSize = DIVUP(getMatrix(msg).getNumElements(), numChunks);
-// printf("num chunks: %d\n", numChunks);
- for (int c = 0; c <= numChunks; ++c) {
- for (vector<CopyPeer*>::const_iterator it = _next.begin(); it != _next.end(); ++it) {
- (*it)->getQueue().enqueue(new CopyChunkMessage(c, chunkSize, numChunks, msg.getScaleSource(), msg.getScaleTargets(), msg.getMatrices()));
- }
- }
- return false;
-}
-
-inline bool CopySource::isSource() const {
- return true;
-}
-
-/* =========================
- * CopyPeer
- * =========================
- */
-CopyPeer::CopyPeer(IBroadcastNetwork& parent, int deviceID, Queue<int>* finishQueue) : ICopySegment(parent, deviceID, finishQueue) {
-}
-
-bool CopyPeer::processMessage(CopyMessage& msg) {
- assert(msg.getType() == CopyMessage::COPY_CHUNK);
- CopyChunkMessage& cmsg = *static_cast<CopyChunkMessage*>(&msg);
- if (cmsg.getChunkIdx() < cmsg.getNumChunks()) {
- if (!isTerminal() || (isTerminal() && msg.getScaleTargets() == 0)) {
- getMatrix(msg).resize(_prev->getMatrix(msg));
- }
-// getMatrix(msg).printShape("getMatrix(msg)");
-// _prev->getMatrix(msg).printShape("_prev->getMatrix(msg)");
- assert(getMatrix(msg).isSameDims(_prev->getMatrix(msg)));
- const float scaleSelf = isTerminal() ? msg.getScaleTargets() : 0;
- const float scalePrev = _prev->isSource() ? msg.getScaleSource() : 1;
- NVMatrix& prevChunk = getChunk(_prev->getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
- NVMatrix& myChunk = getChunk(getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
- prevChunk.add(myChunk, scalePrev, scaleSelf, myChunk, _stream);
- NVMatrix::syncStream(_stream);
- delete &prevChunk;
- delete &myChunk;
- }
- for (vector<CopyPeer*>::const_iterator it = _next.begin(); it != _next.end(); ++it) {
- (*it)->getQueue().enqueue(new CopyChunkMessage(cmsg));
- }
- return cmsg.getChunkIdx() >= cmsg.getNumChunks() && isTerminal();
-}
-
-inline bool CopyPeer::isSource() const {
- return false;
-}
-
-/* =========================
- * IBroadcastNetwork
- * =========================
- */
-IBroadcastNetwork& IBroadcastNetwork::make(set<int> devices, int srcDevice) {
- if (devices.size() == 8) {
- return (new EightGPUBroadcaster1(devices, srcDevice))->construct();
- } else if (devices.size() == 1) {
- return (new NullBroadcaster(devices, srcDevice))->construct();
- } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) {
- return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct();
- }
- return (new NaiveBroadcaster(devices, srcDevice))->construct();
-}
-
-IBroadcastNetwork::IBroadcastNetwork(set<int>& devices, int srcDeviceID, int numTerminal)
- : _devices(devices), _srcDeviceID(srcDeviceID), _numTerminal(numTerminal), _constructed(false), _src(NULL) {
-}
-
-IBroadcastNetwork::~IBroadcastNetwork() {
- vector<ICopySegment*> v;
- v.insert(v.end(), _peers.begin(), _peers.end());
- v.insert(v.end(), _src);
- for (vector<ICopySegment*>::const_iterator it = v.begin(); it != v.end(); ++it) {
- (*it)->getQueue().enqueue(new CopyMessage(CopyMessage::EXIT));
- (*it)->join();
- delete *it;
- }
-}
-
-IBroadcastNetwork& IBroadcastNetwork::construct() {
- assert(!_constructed);
- pair<vector<int>,vector<int> > gpus = makeGPULists();
- _src = new CopySource(*this, _srcDeviceID);
- makePeers(gpus);
- makeConnections();
- _src->start();
- for (vector<CopyPeer*>::const_iterator it = _peers.begin(); it != _peers.end(); ++it) {
- (*it)->start();
- }
- _constructed = true;
- return *this;
-}
-
-pair<vector<int>,vector<int> > IBroadcastNetwork::makeGPULists() {
- vector<int> same, other;
- for (set<int>::const_iterator it = _devices.begin(); it != _devices.end(); ++it) {
- if (*it != _srcDeviceID) {
- if (NVMatrix::canAccessPeer(_srcDeviceID, *it)) {
- same.insert(same.begin() + rand() % (1 + same.size()), *it);
- } else {
- other.insert(other.begin() + rand() % (1 + other.size()), *it);
- }
- }
- }
- return pair<vector<int>,vector<int> >(same, other);
-}
-
-void IBroadcastNetwork::broadcast(std::map<int, NVMatrix*>& mats) {
- _broadcast(mats, 1, 0);
-}
-
-void IBroadcastNetwork::_broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
- assert(_constructed);
- assert(_finishQueue.getNumElements() == 0);
- assert(mats.size() == _devices.size());
- assert(mats.size() > 1);
- if (mats[_srcDeviceID]->getNumElements() == 0) {
- for (map<int,NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
- it->second->resize(*mats[_srcDeviceID]);
- }
- } else {
- _src->getQueue().enqueue(new CopyStartMessage(scaleSource, scaleTargets, mats));
- for (int i = 0; i < _numTerminal; ++i) {
- _finishQueue.dequeue();
- }
- }
- assert(_finishQueue.getNumElements() == 0);
-}
-
-int IBroadcastNetwork::getSourceDeviceID() const {
- return _srcDeviceID;
-}
-
-void IBroadcastNetwork::makePeers(pair<vector<int>,vector<int> >& gpus) {
- vector<int>& same = gpus.first, &other = gpus.second;
- for (int i = 0; i < same.size(); ++i) {
- _peers.push_back(new CopyPeer(*this, same[i], &_finishQueue));
- }
- for (int i = 0; i < other.size(); ++i) {
- _peers.push_back(new CopyPeer(*this, other[i], &_finishQueue));
- }
- _peers.push_back(new CopyPeer(*this, DEVICE_HOST, &_finishQueue)); // peers[7]
-}
-
-/* =========================
- * ISafeBroadcastNetwork
- * =========================
- */
-ISafeBroadcastNetwork& ISafeBroadcastNetwork::make(set<int> devices, int srcDevice) {
- if (devices.size() == 1) {
- return (new NullBroadcaster(devices, srcDevice))->construct();
- } else if (devices.size() == 2 && NVMatrix::canAccessPeer(*devices.begin(), *(++devices.begin()))) {
- return (new TwoPeeringGPUsBroadcaster(devices, srcDevice))->construct();
- }
- return (new NaiveBroadcaster(devices, srcDevice))->construct();
-}
-
-ISafeBroadcastNetwork::ISafeBroadcastNetwork(std::set<int>& devices, int srcDeviceID, int numTerminal) : IBroadcastNetwork(devices, srcDeviceID, numTerminal) {
-}
-
-void ISafeBroadcastNetwork::broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
- _broadcast(mats, scaleSource, scaleTargets);
-}
-
-ISafeBroadcastNetwork& ISafeBroadcastNetwork::construct() {
- IBroadcastNetwork::construct();
- return *this;
-}
-
-/* =========================
- * NullBroadcaster
- * =========================
- */
-NullBroadcaster::NullBroadcaster(std::set<int>& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, 0) {
-}
-
-void NullBroadcaster::makeConnections() {
-}
-
-NullBroadcaster& NullBroadcaster::construct() {
- _constructed = true;
- return *this;
-}
-
-void NullBroadcaster::broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
-}
-
-void NullBroadcaster::broadcast(std::map<int, NVMatrix*>& mats) {
-}
-
-/* =========================
- * NaiveBroadcaster
- * =========================
- *
- * This one does src -> host -> all
- */
-NaiveBroadcaster::NaiveBroadcaster(std::set<int>& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, devices.size()-1) {
-}
-
-void NaiveBroadcaster::makeConnections() {
- _src->addNext(*_peers.back()); // Make connection src -> host
- for (int i = 0; i < _peers.size() - 1; ++i) {
- if (_peers[i]->getDeviceID() != _src->getDeviceID()) {
- _peers.back()->addNext(*_peers[i]); // Make connection host -> peer
- }
- }
-}
-
-/* =========================
- * EightGPUBroadcaster1
- * =========================
- *
- * This one does a fancy graph
- */
-EightGPUBroadcaster1::EightGPUBroadcaster1(set<int>& devices, int srcDeviceID) : IBroadcastNetwork(devices, srcDeviceID, 4) {
-}
-
-void EightGPUBroadcaster1::makeConnections() {
- _src->addNext(*_peers[7]);
- _peers[7]->addNext(*_peers[0]);
- _peers[7]->addNext(*_peers[1]);
- _peers[7]->addNext(*_peers[3]);
- _peers[7]->addNext(*_peers[4]);
-
- _peers[1]->addNext(*_peers[2]);
- _peers[3]->addNext(*_peers[5]);
- _peers[4]->addNext(*_peers[6]);
-}
-
-/* =========================
- * TwoPeeringGPUsBroadcaster
- * =========================
- */
-TwoPeeringGPUsBroadcaster::TwoPeeringGPUsBroadcaster(std::set<int>& devices, int srcDeviceID) : ISafeBroadcastNetwork(devices, srcDeviceID, 0) {
- _tgtDeviceID = *devices.begin() == srcDeviceID ? *(++devices.begin()) : *devices.begin();
-}
-
-TwoPeeringGPUsBroadcaster::~TwoPeeringGPUsBroadcaster() {
- if (_constructed) {
- checkCudaErrors(cudaStreamDestroy(_tgtStream));
- }
-}
-
-void TwoPeeringGPUsBroadcaster::makeConnections() {
-}
-
-void TwoPeeringGPUsBroadcaster::resetDeviceID(int d) {
- if (d >= 0) {
- NVMatrix::setDeviceID(d);
- }
-}
-
-ISafeBroadcastNetwork& TwoPeeringGPUsBroadcaster::construct() {
- assert(!_constructed);
- int d = NVMatrix::getDeviceID();
- NVMatrix::setDeviceID(_tgtDeviceID);
- checkCudaErrors(cudaStreamCreateWithFlags(&_tgtStream, cudaStreamNonBlocking));
- resetDeviceID(d);
- _constructed = true;
- return *this;
-}
-
-void TwoPeeringGPUsBroadcaster::_broadcast(std::map<int, NVMatrix*>& mats, float scaleSource, float scaleTargets) {
- int d = NVMatrix::getDeviceID();
- NVMatrix::setDeviceID(_tgtDeviceID);
- mats[_tgtDeviceID]->add(*mats[_srcDeviceID], scaleTargets, scaleSource, *mats[_tgtDeviceID], _tgtStream);
- NVMatrix::syncStream(_tgtStream);
- resetDeviceID(d);
-}
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <iostream>
-#include "../include/cost.cuh"
-
-using namespace std;
-
-/*
- * =====================
- * Cost
- * =====================
- */
-
-Cost::Cost() {
-}
-
-Cost::Cost(vector<CostLayer*>& costs) {
- for (vector<CostLayer*>::iterator it = costs.begin(); it != costs.end(); ++it) {
- _costMap[(*it)->getName()] = &(*it)->getCost();
- _costCoeffMap[(*it)->getName()] = (*it)->getCoeff();
- _numCases[(*it)->getName()] = (*it)->getNumCases();
- }
-}
-
-int Cost::getNumCases() {
- return _numCases.size() == 0 ? 0 : _numCases.begin()->second;
-}
-
-map<std::string,int>& Cost::getNumCasesMap() {
- return _numCases;
-}
-
-doublev& Cost::operator [](const std::string s) {
- return *_costMap[s];
-}
-
-CostMap& Cost::getCostMap() {
- return _costMap;
-}
-
-CostCoeffMap& Cost::getCostCoeffMap() {
- return _costCoeffMap;
-}
-
-double Cost::getValue() {
- double val = 0;
- for (CostMap::iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
- val += _costCoeffMap[it->first] * (it->second->size() == 0 ? 0 : it->second->at(0));
- }
- return val;
-}
-
-Cost& Cost::operator += (Cost& er) {
- CostMap& otherMap = er.getCostMap();
- CostCoeffMap& otherCoeffMap = er.getCostCoeffMap();
-
- for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) {
- bool newCost = _costMap.count(it->first) == 0;
- if (newCost) {
- _costMap[it->first] = new doublev();
- _costCoeffMap[it->first] = otherCoeffMap[it->first];
- _numCases[it->first] = er.getNumCasesMap()[it->first];
- } else {
- _numCases[it->first] += er.getNumCasesMap()[it->first];
- }
-
- doublev& myVec = *_costMap[it->first];
- doublev& otherVec = *otherMap[it->first];
- assert(myVec.size() == 0 || otherVec.size() == 0 || myVec.size() == otherVec.size());
- // Add costs from otherVec to me
- for (int i = 0; i < otherVec.size(); i++) {
- if (myVec.size() <= i) {
- myVec.push_back(0);
- }
- myVec[i] += otherVec[i];
- }
- }
- return *this;
-}
-
-Cost::~Cost() {
- for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
- delete it->second;
- }
-}
-
-void Cost::print() {
- for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
- printf("%s (%.3f): ", it->first.c_str(), _costCoeffMap[it->first]);
- doublev& vec = *_costMap[it->first];
- for (int z = 0; z < vec.size(); ++z) {
- printf("%.3f", vec[z]);
- if (z < vec.size() - 1) {
- printf(", ");
- }
- }
- printf("\n");
- }
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <algorithm>
-#include <vector>
-#include "../../util/include/matrix.h"
-#include "../include/data.cuh"
-#include "../include/timer.cuh"
-
-using namespace std;
-
-DataProvider::DataProvider(int minibatchSize) :
- _minibatchSize(minibatchSize), _hData(NULL) {
-}
-
-void DataProvider::clearData() {
- delete _hData;
- _hData = NULL;
-}
-
-void DataProvider::setData(CPUData& hData) {
- // DataWorker calls clearData
- _hData = &hData;
- assert(_hData != NULL);
-}
-
-CPUData& DataProvider::getMinibatch(int idx) {
- assert(idx >= 0 && idx < getNumMinibatches());
- return getDataSlice(idx * _minibatchSize, (idx + 1) * _minibatchSize);
-}
-
-CPUData& DataProvider::getDataSlice(int startCase, int endCase) {
- assert(_hData != 0);
- assert(_hData->getNumCases() > 0);
- endCase = min(_hData->getNumCases(), endCase);
- // TODO: maintain these matrices, no point re-creating them all the time
- MatrixV& miniData = *new MatrixV();
-
- for (int i = 0; i < _hData->getData().size(); i++) {
- // NOTE: if hData is transposed, then the output minibatch matrix
- // can be a view. No need to allocate new CPU memory here. Might
- // want to look into optimizing that in the future, though it's
- // unlikely to be a big deal.
- if (_hData->isTrans()) {
- miniData.push_back(&(*_hData)[i].sliceCols(startCase, endCase));
- } else {
- miniData.push_back(new Matrix());
- (*_hData)[i].sliceCols(startCase, endCase, *miniData.back());
- }
- }
- CPUData& cpuData = *new CPUData(&miniData);
- return *new CPUData(&miniData);
-}
-
-int DataProvider::getNumMinibatches() {
- assert(_hData != 0);
- assert(_hData->getNumCases() > 0);
- return DIVUP(_hData->getNumCases(), _minibatchSize);
-}
-
-int DataProvider::getMinibatchSize() {
- return _minibatchSize;
-}
-
-int DataProvider::getNumCases() {
- assert(_hData != 0);
- assert(_hData->getNumCases() > 0);
- return _hData->getNumCases();
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/util.cuh"
-#include "../include/gradreducer.cuh"
-
-using namespace std;
-
-/* =====================
- * IGradReducer
- * =====================
- */
-IActGradReducer::IActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
- : Thread(true, getDeviceCPUs(parent.getDeviceID())), _parent(&parent), _numExpectedMsgs(numExpectedMsgs) {
- _numExpectedMsgsTotal = 0;
- for (map<int,int>::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) {
- _numExpectedMsgsTotal += it->second;
- }
-// printf("%s[%d] expected %d backward msgs\n", parent.getName().c_str(), parent.getReplicaID(), _numExpectedMsgsTotal);
-}
-
-IActGradReducer::~IActGradReducer() {
-
-}
-
-void* IActGradReducer::run() {
- while (true) {
- reset();
- if (reduce()) {
- break;
- }
- _finishQueue.enqueue(0);
- }
- return NULL;
-}
-
-// Cost layer will have nothing to dequeue, so just return immediately.
-int IActGradReducer::waitForFinish() {
- if (_numExpectedMsgsTotal > 0) {
- int i = _finishQueue.dequeue();
- assert(_finishQueue.getNumElements() == 0);
- return i;
- }
-// printf("%s not waiting for finish\n", _name.c_str());
- return 0;
-}
-
-IActGradReducer& IActGradReducer::makeGradReducer(Layer& parent, map<int, int> numExpectedMsgs) {
- int tgtDeviceID = parent.getDeviceID();
- if (numExpectedMsgs.count(tgtDeviceID) == 0) {
- numExpectedMsgs[tgtDeviceID] = 0;
- }
- if (numExpectedMsgs.size() == 8) {
- return *new ParallelActGradReducer(parent, numExpectedMsgs);
- }
- return *new SequentialActGradReducer(parent, numExpectedMsgs);
-}
-
-/* =====================
- * SequentialGradReducer
- * =====================
- */
-SequentialActGradReducer::SequentialActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
- : IActGradReducer(parent, numExpectedMsgs) {
- intv deviceIDs;
- int tgtDeviceID = parent.getDeviceID();
- for (map<int, int>::const_iterator it = numExpectedMsgs.begin(); it != numExpectedMsgs.end(); ++it) {
- if (it->first != tgtDeviceID) {
- deviceIDs.push_back(it->first);
- }
- }
- if (numExpectedMsgs[tgtDeviceID] > 0) {
- deviceIDs.push_back(tgtDeviceID);
- }
-
- sort(deviceIDs.begin(), deviceIDs.end());
-
- int firstDeviceIdx = 0, firstDeviceID = 1 << 16;
- for (int i = 0; i < deviceIDs.size(); ++i) {
- if (deviceIDs[i] >= tgtDeviceID && deviceIDs[i] < firstDeviceID) {
- firstDeviceIdx = i;
- firstDeviceID = deviceIDs[i];
- }
- }
-
- // This is the order in which we process devices.
- for (int i = firstDeviceIdx; _deviceIDs.size() < deviceIDs.size(); i = (i + 1) % deviceIDs.size()) {
- int d = deviceIDs[i];
- _deviceIDs.push_back(d);
- _messageQueues[d] = new Queue<int>();
- }
- //shuffleVector(_deviceIDs, 1, _deviceIDs.size());
- _broadcaster = new StreamBroadcast();
-
- // Note that we MUST process the tgtDeviceID first because
- // we write to it at every iteration, and the computation
- // thread writes to it too. By processing it first we ensure
- // that there's no race condition.
- assert(numExpectedMsgs[tgtDeviceID] == 0 || _deviceIDs[0] == tgtDeviceID);
- reset();
-}
-
-SequentialActGradReducer::~SequentialActGradReducer() {
- for(map<int,Queue<int>* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) {
- delete it->second;
- }
- delete _broadcaster;
-}
-
-void SequentialActGradReducer::reset() {
- for (map<int,int>::iterator it = _numReceivedMsgs.begin(); it != _numReceivedMsgs.end(); ++it) {
- _numReceivedMsgs[it->first] = 0;
- }
-}
-
-bool SequentialActGradReducer::reduce() {
- int tgtDeviceID = _parent->getDeviceID();
- for (int didx = 0; didx < _deviceIDs.size(); ) {
- int d = _deviceIDs[didx];
- _numReceivedMsgs[d] += _messageQueues[d]->dequeue();
- if (_numReceivedMsgs[d] == _numExpectedMsgs[d]) {
- if (d != tgtDeviceID) {
- NVMatrix::setDeviceID(tgtDeviceID);
-
- _parent->getActsGrad().resize(_parent->getActsGrad(d));
- map<int, NVMatrix*> mats;
- mats[d] = &_parent->getActsGrad(d);
- mats[tgtDeviceID] = &_parent->getActsGrad(tgtDeviceID);
-
- _broadcaster->transfer(mats, d, didx > 0, 1);
- }
- didx++;
- assert(_messageQueues[d]->getNumElements() == 0);
- } else if (_numReceivedMsgs[d] >= _numExpectedMsgs[d]) { // exit
- return true;
- }
- }
- return false;
-}
-
-void SequentialActGradReducer::enqueueReduction(int deviceID) {
- _messageQueues[deviceID]->enqueue(1);
-}
-
-void SequentialActGradReducer::stop() {
- for(map<int,Queue<int>* >::const_iterator it = _messageQueues.begin(); it != _messageQueues.end(); ++it) {
- it->second->enqueue(ACT_GRAD_REDUCER_EXIT);
- }
- join();
-}
-
-/* =====================
- * ParallelActGradReducer
- * =====================
- */
-ParallelActGradReducer::ParallelActGradReducer(Layer& parent, map<int, int> numExpectedMsgs)
- : IActGradReducer(parent, numExpectedMsgs), _numReceivedMsgs(0) {
- _reducer = &(new EightGPUReducer1(parent.getDeviceID()))->construct();
-
- _scaleTarget = numExpectedMsgs.count(parent.getDeviceID()) > 0 && numExpectedMsgs[parent.getDeviceID()] > 0;
-}
-
-bool ParallelActGradReducer::reduce() {
- // TODO: make it so that you can start the reduction before you've received all the messages.
- while(_numReceivedMsgs < _numExpectedMsgsTotal) {
- _numReceivedMsgs += _messageQueue.dequeue();
- }
- if (_numReceivedMsgs > _numExpectedMsgsTotal) {
- return true; // exit
- }
- map<int,NVMatrix*> mats = _parent->getAllActsGrads();
- _reducer->reduce(mats, 1, _scaleTarget);
- assert(_messageQueue.getNumElements() == 0);
- return false;
-
-}
-
-void ParallelActGradReducer::enqueueReduction(int deviceID) {
- _messageQueue.enqueue(1);
-}
-
-void ParallelActGradReducer::stop() {
- _messageQueue.enqueue(ACT_GRAD_REDUCER_EXIT);
- join();
-}
-
-void ParallelActGradReducer::reset() {
- _numReceivedMsgs = 0;
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/jpeg.h"
-
-using namespace std;
-
-/* ========================
- * DecoderThread
- * ========================
- */
-DecoderThread::DecoderThread(PyObject* pyList, Matrix& target, int start_img, int end_img, int img_size, int inner_size, bool test, bool multiview)
-: Thread(true), _pyList(pyList), _target(&target), _start_img(start_img), _end_img(end_img),
- _img_size(img_size), _inner_size(inner_size), _test(test), _multiview(multiview),
- _decodeTarget(0), _decodeTargetSize(0) {
-
- _inner_pixels = _inner_size * _inner_size;
- _rseed = time(0);
-}
-
-DecoderThread::~DecoderThread(){
- free(_decodeTarget);
-}
-
-void* DecoderThread::run() {
- int numSrcCases = PyList_GET_SIZE(_pyList);
- assert(_target->getNumCols() == _inner_pixels * 3);
- assert(_target->getNumRows() == PyList_GET_SIZE(_pyList) * (_multiview ? 10 : 1));
-
- int width, height;
-
- for (int64 i = _start_img; i < _end_img; ++i) {
- decodeJpeg(i, width, height);
- assert((width == _img_size && height >= _img_size)
- || (height == _img_size && width >= _img_size));
- if (_multiview) {
- for (int flip = 0; flip < 2; ++flip) {
- crop(numSrcCases * (flip * 5 + 0) + i, width, height, flip, 0, 0); // top-left
- crop(numSrcCases * (flip * 5 + 1) + i, width, height, flip, width - _inner_size, 0); // top-right
- crop(numSrcCases * (flip * 5 + 2) + i, width, height, flip, (width - _inner_size) / 2, (height - _inner_size) / 2); // center
- crop(numSrcCases * (flip * 5 + 3) + i, width, height, flip, 0, height - _inner_size); // bottom-left
- crop(numSrcCases * (flip * 5 + 4) + i, width, height, flip, width - _inner_size, height - _inner_size); // bottom-right
- }
- } else {
- crop(i, width, height, !_test && (rand_r(&_rseed) % 2));
- }
-
- }
- return NULL;
-}
-
-void DecoderThread::decodeJpeg(int idx, int& width, int& height) {
- PyObject* pySrc = PyList_GET_ITEM(_pyList, idx);
- unsigned char* src = (unsigned char*)PyString_AsString(pySrc);
- size_t src_len = PyString_GET_SIZE(pySrc);
-
- struct jpeg_decompress_struct cinf;
- struct jpeg_error_mgr jerr;
- cinf.err = jpeg_std_error(&jerr);
- jpeg_create_decompress(&cinf);
- jpeg_mem_src(&cinf, src, src_len);
- assert(jpeg_read_header(&cinf, TRUE));
- cinf.out_color_space = JCS_RGB;
- assert(jpeg_start_decompress(&cinf));
- assert(cinf.num_components == 3 || cinf.num_components == 1);
- width = cinf.image_width;
- height = cinf.image_height;
-
- if (_decodeTargetSize < width * height * 3) {
- free(_decodeTarget);
- _decodeTargetSize = width * height * 3 * 3;
- _decodeTarget = (unsigned char*)malloc(_decodeTargetSize);
- }
-
- while (cinf.output_scanline < cinf.output_height) {
- JSAMPROW tmp = &_decodeTarget[width * cinf.out_color_components * cinf.output_scanline];
- assert(jpeg_read_scanlines(&cinf, &tmp, 1) > 0);
- }
- assert(jpeg_finish_decompress(&cinf));
- jpeg_destroy_decompress(&cinf);
-}
-
-/*
- * Uniform in [0,1)
- */
-inline double DecoderThread::randUniform() {
- return double(rand_r(&_rseed)) / (int64(RAND_MAX) + 1);
-}
-
-/*
- * Uniform in [min, max)
- */
-inline double DecoderThread::randUniform(double min, double max) {
- return (max - min) * randUniform() + min;
-}
-
-void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip) {
- crop(i, src_width, src_height, flip, -1, -1);
-}
-
-void DecoderThread::crop(int64 i, int64 src_width, int64 src_height, bool flip, int64 crop_start_x, int64 crop_start_y) {
- const int64 border_size_y = src_height - _inner_size;
- const int64 border_size_x = src_width - _inner_size;
- if (crop_start_x < 0) {
- crop_start_x = _test ? (border_size_x / 2) : (rand_r(&_rseed) % (border_size_x + 1));
- }
- if (crop_start_y < 0) {
- crop_start_y = _test ? (border_size_y / 2) : (rand_r(&_rseed) % (border_size_y + 1));
- }
- const int64 src_pixels = src_width * src_height;
- for (int64 c = 0; c < 3; ++c) {
- for (int64 y = crop_start_y; y < crop_start_y + _inner_size; ++y) {
- for (int64 x = crop_start_x; x < crop_start_x + _inner_size; ++x) {
- assert((y >= 0 && y < src_height && x >= 0 && x < src_width));
- _target->getCell(i, c * _inner_pixels + (y - crop_start_y) * _inner_size
- + (flip ? (_inner_size - 1 - x + crop_start_x)
- : (x - crop_start_x)))
- = _decodeTarget[3 * (y * src_width + x) + c];
- }
- }
- }
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <helper_cuda.h>
-#include <iostream>
-#include <set>
-#include "../../cudaconv3/include/cudaconv2.cuh"
-#include "../../util/include/matrix.h"
-#include "../include/layer_kernels.cuh"
-#include "../include/layer.cuh"
-#include "../include/data.cuh"
-#include "../include/util.cuh"
-#include "../include/weights.cuh"
-
-using namespace std;
-
-/*
- * =======================
- * Layer
- * =======================
- */
-Layer::Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans) :
- _convNetThread(convNetThread), _replicaID(replicaID), _trans(trans) {
- _name = pyDictGetString(paramsDict, "name");
- _type = pyDictGetString(paramsDict, "type");
-
- _foundGradConsumers = false;
- _gradConsumer = pyDictGetInt(paramsDict, "gradConsumer");
- _actsTarget = pyDictGetInt(paramsDict, "actsTarget");
- _actsGradTarget = pyDictGetInt(paramsDict, "actsGradTarget");
- _numOutputs = pyDictGetInt(paramsDict, "outputs");
- _numReplicas = pyDictGetInt(paramsDict, "numReplicas");
- _numReplicasPrev = 1;
- _rcvdBInputMsgs = 0;
-
- _actBroadcaster = NULL;
- _gradReducer = NULL;
- _initialized = false;
-}
-
-Layer::~Layer() {
- if (_actBroadcaster != NULL) {
- _actBroadcaster->stop();
- delete _actBroadcaster;
- }
- if (_gradReducer != NULL) {
- _gradReducer->stop();
- delete _gradReducer;
- }
- // For now, gradReducer doesn't have a destructor
-// delete _gradReducer;
- for (std::map<int, MemoryView*>::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) {
- if (it->second->getMemorySource().truncate(_name)) {
- delete &it->second->getMemorySource();
- }
- }
- for (std::map<int, MemoryView*>::iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) {
- if (it->second->getMemorySource().truncate(_name)) {
- delete &it->second->getMemorySource();
- }
- }
-}
-
-cudaStream_t Layer::getStream() {
- assert(getDeviceID() >= 0);
- return NVMatrix::getDefaultStream(getDeviceID());
-}
-
-void Layer::syncStream() {
- NVMatrix::syncStream(getStream());
-}
-
-void Layer::fpropNext(PASS_TYPE passType, int passIdx) {
- if (_next.size() > 0) {
- if (getFwdActiveReplicaIdx(passIdx) == 0/*getReplicaIdx()*/) { // 0 turns on pipelining
- if (_nextDeviceIDs.size() > 1 || (_nextDeviceIDs.size() == 1 && _nextDeviceIDs[0] != getDeviceID())) {
- syncStream(); // Make sure I've finished computing before broadcasting
- }
- getActBroadcaster().getMessageQueue().enqueue(new BroadcastMessage(getAllActs(), getDeviceID(), getReplicaIdx(), _broadcastFinishQueue));
- }
- if (getFwdActiveReplicaIdx(passIdx) == getReplicaIdx()) {
- _broadcastFinishQueue.dequeue();
- assert(_broadcastFinishQueue.getNumElements() == 0);
- }
- }
-
- for (int i = 0; i < _next.size(); i++) {
- _next[i]->getConvNetThread().getMessageQueue().enqueue(new FpropMessage(*_next[i], passType, passIdx));
- }
-}
-
-bool Layer::fprop(PASS_TYPE passType, int passIdx) {
- _rcvdFInputMsgs++;
- // I require messages from *all* input replicas because it makes the propagation easier to think about.
- // Without this requirement, when all fprop terminal msgs arrive to ConvNet, the forward propagation
- // might not actually be finished yet.
- if (_rcvdFInputMsgs == getNumExpectedFwdMsgs()) {
-// printf("Layer %s[%d] fprop\n", _name.c_str(), getReplicaID());
- int ridx = getFwdActiveInputReplicaIdx(passIdx);
- assert(getDeviceID() == NVMatrix::getDeviceID());
- map<int, NVMatrix*> v;
- if (ridx >= 0) {
- for (int i = 0; i < getNumLayersPrev(); i++) {
- v[i] = &_prev[ridx][i]->getActs(getDeviceID());
- }
- }
- fprop(v, passType, passIdx);
- return true;
- }
- return false;
-}
-
-void Layer::fprop(map<int,NVMatrix*>& v, PASS_TYPE passType, int passIdx) {
- if (getFwdActiveInputReplicaIdx(passIdx) >= 0) {
- assert(v.size() == getNumLayersPrev());
- _inputs.clear();
- _inputs.insert(v.begin(), v.end());
-
- int numCases = _inputs[0]->getLeadingDim();
- for (map<int,MemoryView*>::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) {
- it->second->getMemory(numCases);
- }
-
- if (numCases > 0) {
- //printf("layer %s fprop, numcases: %d\n", _name.c_str(), numCases);
- _rcvdFInputMsgs = getNumExpectedFwdMsgs();
- for (map<int,NVMatrix*>::iterator it = v.begin(); it != v.end(); ++it) {
- it->second->transpose(_trans);
- }
- getActs().transpose(_trans);
-
- fpropCommon(passType);
-
- // First do fprop on the input whose acts matrix I'm sharing, if any
- if (_actsTarget >= 0) {
- fpropActs(_actsTarget, 0, passType, passIdx);
- }
- // Then add the rest of the inputs to that
- for (int i = 0; i < getNumLayersPrev(); i++) {
- if (i != _actsTarget) {
- fpropActs(i, _actsTarget >= 0 || i > 0, passType, passIdx);
- }
- }
- }
- }
- fpropNext(passType, passIdx);
-}
-
-void Layer::truncBwdActs() {
- // Only truncate actsGrad if I own it
- if (_actsGradTarget < 0) {
- for (map<int,MemoryView*>::iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) {
- it->second->getMemorySource().truncate(getName());
- }
- }
- if (_actsTarget < 0) {
- for (map<int,MemoryView*>::iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) {
- it->second->getMemorySource().truncate(getName());
- }
- }
-}
-
-int Layer::getNumGradProducersNext() {
- return _numGradProducersNext;
-}
-
-int Layer::getNumExpectedBwdMsgs() {
- return _numGradProducersNext * getNumSiblingReplicas();
-}
-
-int Layer::getNumExpectedFwdMsgs() {
- return getNumLayersPrev() * getNumInputReplicas();
-}
-
-void Layer::bprop(PASS_TYPE passType, int passIdx) {
- if (getBwdActiveInputReplicaIdx(passIdx) >= 0 && _rcvdBInputMsgs == getNumExpectedBwdMsgs()) {
-// printf("Layer %s[%d] bprop\n", _name.c_str(), getReplicaID());
- if (_gradReducer != NULL) {
- _gradReducer->waitForFinish();
- }
-
- // This does sync, but only if it has grad consumers below! so we must sync again before sending bprop terminal messages
- bprop(getActsGrad(), passType, passIdx);
-
- if (_bwdTerminal[passIdx]) {
- syncStream();
- getConvNet().getMessageQueue().enqueue(new Message(BPROP_TERMINAL));
- }
- }
-}
-
-void Layer::bpropActsCall(NVMatrix& v, PASS_TYPE passType, int replicaIdx, int inputIdx) {
- Layer& prev = *_prev[replicaIdx][inputIdx];
- if (prev.isGradConsumer() && isGradProducer(prev.getName())) {
- if (v.getLeadingDim() > 0) { // Only do computation if #cases > 0
- bpropActs(v, replicaIdx, inputIdx, prev.getNumComputedActsGrads(getDeviceID()) > 0, passType);
- }
- prev.getNumComputedActsGrads(getDeviceID())++;
- // Synchronize if the previous layer is going to actually do a reduction.
- // If the previous layer is on the same GPU as us and has no next layers
- // on other GPUs then it won't need to do a reduction.
- if (prev.getNextDeviceIDs().size() > 1 || (prev.getNextDeviceIDs().size() == 1 && getDeviceID() != prev.getDeviceID())) {
- syncStream();
- }
- prev.getGradReducer().enqueueReduction(getDeviceID());
- }
-}
-
-void Layer::bprop(NVMatrix& v, PASS_TYPE passType, int passIdx) {
-
- v.transpose(_trans);
- assert(getDeviceID() == NVMatrix::getDeviceID());
- int ridx = getBwdActiveInputReplicaIdx(passIdx);
- LayerV& prev = _prev[ridx];
- map<int, set<Layer*> > prevByDevice = _prevByDevice[ridx];
-
- for (int i = 0; i < prev.size(); i++) {
- _inputs[i]->transpose(_trans);
- prev[i]->getActsGrad().transpose(_trans);
- }
- getActs().transpose(_trans);
- // NOTE: this should be here (before the bpropActs) because if you have a layer
- // that has a weight matrix AND actsGradTarget >= 0, then the stuff below will overwrite
- // v which is used in bpropCommon. So bpropCommon must come first.
- bpropCommon(v, ridx, passType);
-
- if (isGradProducer()) {
- // First propagate activity gradient to all layers whose activity
- // gradient matrix I'm definitely not sharing.
- for (map<int, set<Layer*> >::const_iterator it = prevByDevice.begin(); it != prevByDevice.end(); ++it) {
- const set<Layer*>& deviceLayers = it->second;
- for (set<Layer*>::const_iterator it2 = deviceLayers.begin(); it2 != deviceLayers.end(); ++it2) {
- if (_actsGradTarget != (*it2)->getInputIdx(_name)) {
- bpropActsCall(v, passType, ridx, (*it2)->getInputIdx(_name));
- }
- }
- }
-
- // Then propagate activity gradient to the layer whose activity gradient
- // matrix I'm sharing, if any.
- if (_actsGradTarget >= 0) {
- bpropActsCall(v, passType, ridx, _actsGradTarget);
- }
- }
-
- // Synchronization is necessary because the kernel calls that compute my backward acts
- // execute asynchronously. Therefore I don't want to tell other threads that I've
- // computed bprop activities for them when in fact I've only called a function which
- // will eventually compute them.
- if (_prevDeviceIDs.size() > 1 || (_prevDeviceIDs.size() == 1 && _prevDeviceIDs[0] != getDeviceID())) {
- syncStream();
- }
-
- if (getConvNet().isConserveMemory()) {
- truncBwdActs();
- }
-
- if (isGradProducer()) {
- /*for (int i = 0; i < prev.size(); i++) {
- if (prev[i]->isGradConsumer() && isGradProducer(prev[i]->getName())) {
- prev[i]->getGradReducer().enqueueReduction(getDeviceID());
- }
- }*/
-
- // Send backward messages to *all* replicas.
- // Note that the messages will be dismissed unless the passIdx indicates
- // that the previous layer should do some work.
- for (int r = 0; r < getNumInputReplicas(); r++) {
- for (int i = 0; i < _prev[r].size(); i++) {
- if (_prev[r][i]->isGradConsumer() && isGradProducer(_prev[r][i]->getName())) {
- _prev[r][i]->getConvNetThread().getMessageQueue().enqueue(new BpropMessage(*_prev[r][i], passType, passIdx));
- }
- }
- }
- }
-}
-
-IActGradReducer& Layer::getGradReducer() {
- return *_gradReducer;
-}
-
-// This is called between minibatches
-void Layer::reset() {
- _rcvdFInputMsgs = 0;
- _rcvdBInputMsgs = 0;
- for (map<int,int>::iterator it = _numComputedActsGrads.begin(); it != _numComputedActsGrads.end(); ++it) {
- it->second = 0;
- }
-}
-
-// This is called between microbatches
-void Layer::resetPassIdx() {
- _rcvdFInputMsgs = 0;
- if (_rcvdBInputMsgs >= getNumExpectedBwdMsgs()) {
- reset();
- }
-}
-
-/*
- * Returns number of cases in given matrix.
- */
-int Layer::getNumCases(NVMatrix& v) {
- return v.getLeadingDim();
-}
-
-int Layer::incRcvdBInputMsgs() {
- return ++_rcvdBInputMsgs;
-}
-
-std::string& Layer::getName() {
- return _name;
-}
-
-std::string& Layer::getType() {
- return _type;
-}
-
-int& Layer::getNumComputedActsGrads(int deviceID) {
- return _numComputedActsGrads[deviceID];
-}
-
-void Layer::addNext(Layer& l) {
- _next.push_back(&l);
- _numReplicasNext = l.getNumReplicas();
- if (count(_nextDeviceIDs.begin(), _nextDeviceIDs.end(), l.getDeviceID()) == 0) {
- int pos = rand() % (_nextDeviceIDs.size() + 1);
- _nextDeviceIDs.insert(_nextDeviceIDs.begin() + pos, l.getDeviceID());
- }
-}
-
-void Layer::addPrev(Layer& l, int replicaIdx) {
- _prev[replicaIdx].push_back(&l);
- _numReplicasPrev = l.getNumReplicas();
- l.setInputIdx(getName(), _prev[replicaIdx].size() - 1);
- if (l.getDeviceID() >= 0 && count(_prevDeviceIDs.begin(), _prevDeviceIDs.end(), l.getDeviceID()) == 0) {
- int pos = rand() % (_prevDeviceIDs.size() + 1);
- _prevDeviceIDs.insert(_prevDeviceIDs.begin() + pos, l.getDeviceID());
- }
-}
-
-void Layer::addReplica(Layer& l) {
- assert(_replicas.count(l.getReplicaID()) == 0);
- _replicas[l.getReplicaID()] = &l;
-}
-
-bool Layer::hasGradProducerNext(std::string& layerName) {
- bool b = _next.size() == 0;
- for (int i = 0; i < _next.size(); i++) {
- b |= _next[i]->hasGradProducerNext(_name);
- }
- return b && isGradProducer(layerName);
-}
-
-bool Layer::postInit() {
- // We choose not to populate _outputs[getDeviceID()] here because we do it instead in fprop().
- // In fprop(), we can populate it from the _inputs vector, which is a bit more general than populating
- // it from _prev->getActs()
-// _outputs = _actsTarget < 0 ? new NVMatrix() : &_prev[_actsTarget]->getActs();
- if (!_initialized) {
- _initialized = true;
- map<int,int> numGradProducersNext;
- _numGradProducersNext = 0;
- for (int r = 0; r < getNumInputReplicas(); ++r) {
- for (vector<Layer*>::const_iterator it = _prev[r].begin(); it != _prev[r].end(); ++it) {
- (*it)->postInit();
- }
- }
-
- _memSrcActs[getDeviceID()] = _actsTarget < 0 ? &MemorySource::make(_numOutputs, getDeviceID(), getName())
- : &_prev[0][_actsTarget]->getMemorySourceActs(getDeviceID()).clone(_name);
-
- // _actsGradTarget will only be >= 0 when the number of replicas is the same in both layers, so this justifies the use of _prev[0]
-
- _memSrcActsGrad[getDeviceID()] = _actsGradTarget < 0 ? &MemorySource::make(_numOutputs, getDeviceID(), getName())
- : &_prev[0][_actsGradTarget]->getMemorySourceActsGrad(getDeviceID()).clone(_name);
- for (int i = 0; i < _next.size(); ++i) {
- int d = _next[i]->getDeviceID();
- _numComputedActsGrads[d] = 0;
- if (_next[i]->hasGradProducerNext(_name)) {
- if (numGradProducersNext.count(d) == 0) {
- numGradProducersNext[d] = 0;
- }
- numGradProducersNext[d]++;
- _numGradProducersNext++;
- if (_memSrcActsGrad.count(d) == 0) {
- _memSrcActsGrad[d] = &MemorySource::make(_numOutputs, d, getName());
- }
- }
- if (_memSrcActs.count(d) == 0) {
- _memSrcActs[d] = &MemorySource::make(_numOutputs, d, getName());
- }
- }
-
- if (_next.size() == 0) {
- _numReplicasNext = getNumReplicas();
- }
-
- /*
- * Initialize forward broadcaster. First sibling owns it.
- */
- if (getReplicaIdx() == 0 && _convNetThread != NULL) {
- _actBroadcaster = new ActBroadcaster(getNumSiblingReplicas(), getDeviceCPUs(_convNetThread->getDeviceID()));
- _actBroadcaster->start();
- }
-
- /*
- * Initialize backward reducer.
- */
- if (isGradConsumer() && _numGradProducersNext > 0) {
- _gradReducer = &IActGradReducer::makeGradReducer(*this, numGradProducersNext);
- _gradReducer->start();
- }
-
- /*
- * Initialize specially sorted previous array
- */
- for (int r = 0; r < _prev.size(); ++r) {
- for (int i = 0; i < _prev[r].size(); ++i) {
- // Previous devices in reverse order of processing by (sequential) GradReducer
- _prevByDevice[r][getDeviceID() - _prev[r][i]->getDeviceID()
- + 16 * (_prev[r][i]->getDeviceID() > getDeviceID())].insert(_prev[r][i]);
-
- }
- }
- return true;
- }
- return false;
-}
-
-ActBroadcaster& Layer::getActBroadcaster() {
- return getReplicaIdx() == 0 ? *_actBroadcaster : _replicas[getReplicaID() - getReplicaIdx()]->getActBroadcaster();
-}
-
-// Does this layer, or some layer below it, need the gradient
-// for parameter updates?
-// Only weight layers should be grad consumers themselves.
-bool Layer::isGradConsumer() {
- if (!_foundGradConsumers && _prev.size() > 0) {
- for (int i = 0; i < _prev[0].size(); i++) {
- _gradConsumer |= _prev[0][i]->isGradConsumer();
- }
- _foundGradConsumers = true;
- }
- return _gradConsumer;
-}
-
-// Does this layer produce gradient for layers below?
-bool Layer::isGradProducer() {
- return true;
-}
-
-bool Layer::isGradProducer(std::string& layerName) {
- return isGradProducer();
-}
-
-map<int,vector<Layer*> >& Layer::getPrev() {
- return _prev;
-}
-
-vector<Layer*>& Layer::getNext() {
- return _next;
-}
-
-NVMatrix& Layer::getActs() {
- return getActs(getDeviceID());
-}
-
-NVMatrix& Layer::getActs(int deviceID) {
- assert(_memSrcActs.count(deviceID) > 0);
- return _memSrcActs[deviceID]->getMemory();
-}
-
-NVMatrix& Layer::getActs(int deviceID, int numCases) {
- assert(_memSrcActs.count(deviceID) > 0);
- return _memSrcActs[deviceID]->getMemory(numCases);
-}
-
-NVMatrix& Layer::getActsGrad(int deviceID) {
- assert(_memSrcActsGrad.count(deviceID) > 0);
- return _memSrcActsGrad[deviceID]->getMemory(getActs(deviceID).getLeadingDim());
-}
-
-NVMatrix& Layer::getActsGrad() {
- return getActsGrad(NVMatrix::getDeviceID());
-}
-
-map<int, NVMatrix*> Layer::getAllActs() {
- map<int, NVMatrix*> m;
- for (map<int, MemoryView*>::const_iterator it = _memSrcActs.begin(); it != _memSrcActs.end(); ++it) {
- m[it->first] = &it->second->getMemory();
- }
- return m;
-}
-
-map<int, NVMatrix*> Layer::getAllActsGrads() {
- map<int, NVMatrix*> m;
- for (map<int, MemoryView*>::const_iterator it = _memSrcActsGrad.begin(); it != _memSrcActsGrad.end(); ++it) {
- m[it->first] = &it->second->getMemory();
- }
- return m;
-}
-
-int Layer::getDeviceID() {
- return _convNetThread == NULL ? -1 : _convNetThread->getDeviceID();
-}
-
-ConvNetThread& Layer::getConvNetThread() {
- assert(_convNetThread != NULL);
- return *_convNetThread;
-}
-
-ConvNet& Layer::getConvNet() {
- return getConvNetThread().getConvNet();
-}
-
-void Layer::setBwdTerminal(int passIdx) {
- _bwdTerminal[passIdx] = true;
-}
-
-int Layer::getReplicaID() {
- return _replicaID;
-}
-
-int Layer::getActivePassPeriod() {
- return getNumReplicas() / getConvNet().getNumReplicasMin();
-}
-
-int Layer::getFwdActiveInputReplicaIdx(int passIdx) {
- const int edge = (passIdx / getActivePassPeriod()) % getNumInputReplicas();
- return passIdx % getActivePassPeriod() == 0 ? edge : -1;
-}
-
-int Layer::getBwdActiveInputReplicaIdx(int passIdx) {
- const int edge = (passIdx / getActivePassPeriod()) % getNumInputReplicas();
- return (passIdx + 1) % getActivePassPeriod() == 0 ? edge : -1;
-}
-
-int Layer::getFwdActiveReplicaIdx(int passIdx) {
- assert(_next.size() > 0);
- return _next[0]->getFwdActiveInputReplicaIdx(passIdx);
-}
-
-int Layer::getNumReplicas() {
- return _replicas.size();
-}
-
-int Layer::getNumSiblingReplicas() {
- return getNumReplicas() / getNumReplicasNext();
-}
-
-int Layer::getNumReplicasPrev() {
- return _numReplicasPrev;
-}
-
-int Layer::getNumReplicasNext() {
- return _numReplicasNext;
-}
-
-int Layer::getNumInputReplicas() {
- return _numReplicasPrev / getNumReplicas();
-}
-
-int Layer::getReplicaIdx() {
- return getReplicaID() % getNumSiblingReplicas();
-}
-
-int Layer::getNumLayersPrev() {
- return _prev.size() > 0 ? _prev[0].size() : 0;
-}
-
-void Layer::setMemorySourceActs(int deviceID, MemoryView& mem) {
- assert(_memSrcActs[deviceID]->isParent());
- delete _memSrcActs[deviceID];
- _memSrcActs[deviceID] = &mem;
- if (_actsTarget >= 0 && deviceID == getDeviceID()) {
- assert(getNumInputReplicas() == 1);
- _prev[0][_actsTarget]->setMemorySourceActs(deviceID, mem.clone(_prev[0][_actsTarget]->getName()));
- }
-}
-
-void Layer::setMemorySourceActsGrad(int deviceID, MemoryView& mem) {
- assert(_memSrcActsGrad[deviceID]->isParent());
- delete _memSrcActsGrad[deviceID];
- _memSrcActsGrad[deviceID] = &mem;
- if (_actsGradTarget >= 0 && deviceID == getDeviceID()) {
- assert(getNumInputReplicas() == 1);
- _prev[0][_actsGradTarget]->setMemorySourceActsGrad(deviceID, mem.clone(_prev[0][_actsGradTarget]->getName()));
- }
-}
-
-MemoryView& Layer::getMemorySourceActs(int deviceID) {
- return *_memSrcActs[deviceID];
-}
-
-MemoryView& Layer::getMemorySourceActsGrad(int deviceID) {
- return *_memSrcActsGrad[deviceID];
-}
-
-int Layer::getNumOutputs() {
- return _numOutputs;
-}
-
-void Layer::setInputIdx(std::string& parentName, int idx) {
- _inputIndices[parentName] = idx;
-}
-
-int Layer::getInputIdx(std::string& parentName) {
- return _inputIndices[parentName];
-}
-
-/*
- * =======================
- * NeuronLayer
- * =======================
- */
-NeuronLayer::NeuronLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
- : Layer(convNetThread, paramsDict, replicaID, true) {
- PyObject* neuronDict = PyDict_GetItemString(paramsDict, "neuron");
- _neuronType = pyDictGetString(neuronDict, "type");
- _neuron = &Neuron::makeNeuron(neuronDict);
-}
-
-NeuronLayer::~NeuronLayer() {
- delete _neuron;
-}
-
-void NeuronLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(inpIdx == 0);
- if (!bpropSpecial(v, replicaIdx, inpIdx, scaleTargets, passType)) {
- _neuron->computeInputGrad(v, _prev[replicaIdx][0]->getActsGrad(), scaleTargets > 0);
- }
-}
-
-bool NeuronLayer::bpropSpecial(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- // Special optimization for cross-entropy objective with logistic units.
- // Better to just compute the input gradient in one go to avoid division by small numbers.
- bool doCrossEntGrad = _neuronType == "logistic" && _next.size() == 1
- && (_next[0]->getType() == "cost.bce" || _next[0]->getType() == "cost.dce")
- && _next[0]->getDeviceID() == getDeviceID()
- && _next[0]->getNumReplicas() == getNumReplicas();
- LayerV& prev = _prev[replicaIdx];
- if (doCrossEntGrad) {
- NVMatrix& labels = _next[0]->getPrev()[replicaIdx][0]->getActs(getDeviceID());
- BinomialCrossEntropyCostLayer& cost = *static_cast<BinomialCrossEntropyCostLayer*>(_next[0]);
- float gradCoeff = cost.getCoeff();
- labels.transpose(_trans);
- if (cost.getPosWeight() == 1) {
- if (scaleTargets == 0) {
- getActs().add(labels, -gradCoeff, gradCoeff, prev[0]->getActsGrad());
- } else {
- getActs().applyTernary(AddGradientBinaryOperator<NVMatrixBinaryOps::WeightedAdd>(NVMatrixBinaryOps::WeightedAdd(-gradCoeff, gradCoeff)),
- labels, prev[0]->getActsGrad(), prev[0]->getActsGrad());
- }
- } else {
- if (scaleTargets == 0) {
- getActs().applyBinary(CrossEntLogisticGradientOperator(gradCoeff, cost.getPosWeight()), labels, prev[0]->getActsGrad());
- } else {
- getActs().applyTernary(AddGradientBinaryOperator<CrossEntLogisticGradientOperator>(CrossEntLogisticGradientOperator(gradCoeff, cost.getPosWeight())),
- labels, prev[0]->getActsGrad(), prev[0]->getActsGrad());
- }
- }
- }
- return doCrossEntGrad;
-}
-
-void NeuronLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- _neuron->activate(*_inputs[0], getActs());
-}
-
-std::string& NeuronLayer::getNeuronType() {
- return _neuronType;
-}
-
-/*
- * =======================
- * WeightLayer
- * =======================
- *
- * The useGrad parameter here merely expresses a preference by the subclass. It may
- * be overridden by the superclass (WeightLayer) and in that case the subclass must follow its wishes.
- * So when computing gradient updates, the subclass must always first check weights.isUseGrad().
- *
- * Note: biases always useGrad.
- */
-WeightLayer::WeightLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans, bool useGrad) :
- Layer(convNetThread, paramsDict, replicaID, trans) {
- _weightUpdatePassPeriod = pyDictGetInt(paramsDict, "updatePeriod");
-
- MatrixV& hWeights = *pyDictGetMatrixV(paramsDict, "weights");
- MatrixV& hWeightsInc = *pyDictGetMatrixV(paramsDict, "weightsInc");
- Matrix& hBiases = *pyDictGetMatrix(paramsDict, "biases");
- Matrix& hBiasesInc = *pyDictGetMatrix(paramsDict, "biasesInc");
- PyObject* pyEpsWList = PyDict_GetItemString(paramsDict, "epsW");
- PyObject* pyEpsB = PyDict_GetItemString(paramsDict, "epsB");
- floatv& momW = *pyDictGetFloatV(paramsDict, "momW");
- float momB = pyDictGetFloat(paramsDict, "momB");
- floatv& wc = *pyDictGetFloatV(paramsDict, "wc");
- floatv& wball = *pyDictGetFloatV(paramsDict, "wballNormed");
-
- /*
- * When there are multiple replicas, the present implementation
- * requires that useGrad is true. This is because weights.update()
- * performs a simultaneous write to both replicas' weightsInc matrix,
- * which means that the read should come from somewhere else (i.e. a
- * grads matrix).
- */
- useGrad |= _numReplicas > 1;
-
- // Source layers for shared weights
- stringv& weightSourceLayers = *pyDictGetStringV(paramsDict, "weightSourceLayers");
-
- // Weight matrix indices (inside the above source layers) for shared weights
- intv& weightSourceMatrixIndices = *pyDictGetIntV(paramsDict, "weightSourceMatrixIndices");
- _weights = new WeightList();
- for (int i = 0; i < weightSourceLayers.size(); i++) {
- std::string& srcLayerName = weightSourceLayers[i];
- int matrixIdx = weightSourceMatrixIndices[i];
- PyObject* pyEpsW = PyList_GetItem(pyEpsWList, i);
- ParameterSchedule& lrs = ParameterSchedule::make(pyEpsW); // Learning rate schedule
- if (srcLayerName == _name) { // Current layer
- _weights->addWeights(*new Weights(_weights->at(matrixIdx), lrs, *this));
- } else if (srcLayerName != "") {
- WeightLayer& srcLayer = *static_cast<WeightLayer*>(&convNetThread->getLayer(srcLayerName));
- Weights* srcWeights = &srcLayer.getWeights(matrixIdx);
- _weights->addWeights(*new Weights(*srcWeights, lrs, *this));
- } else {
- _weights->addWeights(*new Weights(*hWeights[i], *hWeightsInc[i], lrs, *this, wc[i], wball[i], momW[i], useGrad));
- }
- }
- _biases = new Weights(hBiases, hBiasesInc, ParameterSchedule::make(pyEpsB), *this, 0, 0, momB, true);
-
- delete &weightSourceLayers;
- delete &weightSourceMatrixIndices;
- delete &hWeights;
- delete &hWeightsInc;
- delete &momW;
- delete &wc;
- delete &wball;
-
- _wStep = 0.02;
- _bStep = 0.005;
-}
-
-WeightLayer::~WeightLayer() {
- delete _weights;
- delete _biases;
-}
-
-bool WeightLayer::postInit() {
- if (Layer::postInit()) {
- _weightUpdatePassPeriod = max(_weightUpdatePassPeriod, getActivePassPeriod());
- assert(_weightUpdatePassPeriod % getActivePassPeriod() == 0);
- return true;
- }
- return false;
-}
-
-void WeightLayer::fpropCommon(PASS_TYPE passType) {
-}
-
-void WeightLayer::bpropCommon(NVMatrix& v, int replicaIdx, PASS_TYPE passType) {
- if (_biases->getLearningRateSchedule().getBaseValue() > 0) {
- if (v.getNumElements() > 0) {
- bpropBiases(v, passType);
- } else {
- _biases->getGrad().resize(_biases->getW());
- _biases->getGrad().scale(getBIncScale());
- }
- _biases->incNumUpdates();
- }
- for (int i = 0; i < _weights->getSize(); i++) {
- if (_weights->at(i).getLearningRateSchedule().getBaseValue() > 0) {
- if (v.getNumElements() > 0) {
- bpropWeights(v, replicaIdx, i, passType);
- } else {
- _weights->at(i).getGrad().resize(_weights->at(i).getW());
- // This will cause it to forget momentum when shown 0 training cases
- // and _useGrad = false but it's not too important.
- _weights->at(i).getGrad().scale(getIncScale(i, passType));
- }
- // Increment its number of updates
- _weights->at(i).incNumUpdates();
- }
- }
-}
-
-bool WeightLayer::updateWeights() {
- if (getConvNet().getTotalPassesDone() % _weightUpdatePassPeriod == 0) {
- _weights->update(getConvNet().getTrainingProgress());
- _biases->update(getConvNet().getTrainingProgress());
-// constrainWeights();
- return true;
- }
- return false;
-}
-
-bool WeightLayer::constrainWeights() {
- if (getConvNet().getTotalPassesDone() % _weightUpdatePassPeriod == 0) {
- _constrainWeights();
- return true;
- }
- return false;
-}
-
-void WeightLayer::_constrainWeights() {
-}
-
-void WeightLayer::copyToCPU() {
- _weights->copyToCPU();
- _biases->copyToCPU();
-}
-
-void WeightLayer::copyToGPU() {
- _weights->copyToGPU();
- _biases->copyToGPU();
-}
-
-void WeightLayer::checkGradient() {
- for (int i = 0; i < _weights->getSize(); i++) {
- getConvNet().checkGradient(_name + " weights[" + tostr(i) + "]", _wStep, _weights->at(i));
- }
- getConvNet().checkGradient(_name + " biases", _bStep, *_biases);
-}
-
-void WeightLayer::addReplica(Layer& l) {
- Layer::addReplica(l);
- _weights->addReplica(*static_cast<WeightLayer*>(&l)->_weights);
- _biases->addReplica(*static_cast<WeightLayer*>(&l)->_biases);
-}
-
-Weights& WeightLayer::getWeights(int idx) {
- return _weights->at(idx);
-}
-
-float WeightLayer::getGradScale(int inpIdx, PASS_TYPE passType) {
- // weight update period must be multiple of activation period
- // TODO: simply accumulate # of cases seen between weight updates. simpler and more accurate.
- double numCases = _weightUpdatePassPeriod * (getConvNet().getMinibatchSize() / double(getConvNet().getNumPasses()));
- if (_weights->at(inpIdx).isUseGrad()) {
- return passType == PASS_GC ? 1.0f : 1.0f / numCases;
- }
- return passType == PASS_GC ? 1.0f : _weights->at(inpIdx).getEps(getConvNet().getTrainingProgress()) / numCases;
-}
-
-float WeightLayer::getIncScale(int inpIdx, PASS_TYPE passType) {
- if (_weights->at(inpIdx).isUseGrad()) {
- return _weights->at(inpIdx).getNumUpdates() > 0;
- }
- return (passType == PASS_GC ? _weights->at(inpIdx).getNumUpdates() > 0
- : (_weights->at(inpIdx).getNumUpdates() == 0 ? _weights->at(inpIdx).getMom() : 1.0f));
-}
-
-NVMatrix& WeightLayer::getGradTarget(int inpIdx) {
- return _weights->at(inpIdx).getGrad();
-}
-
-float WeightLayer::getBGradScale(PASS_TYPE passType) {
- int numCases = _weightUpdatePassPeriod * DIVUP(getConvNet().getMinibatchSize(), getConvNet().getNumPasses());
- return passType == PASS_GC ? 1.0f : 1.0f / numCases;
-}
-
-float WeightLayer::getBIncScale() {
- return _biases->getNumUpdates() > 0;
-}
-
-NVMatrix& WeightLayer::getWeightMatrix(PASS_TYPE passType, int inpIdx) {
- return _weights->at(inpIdx).getW();
-}
-
-NVMatrix& WeightLayer::getBiasMatrix(PASS_TYPE passType) {
- return _biases->getW();
-}
-
-/*
- * =======================
- * FCLayer
- * =======================
- */
-FCLayer::FCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad)
- : WeightLayer(convNetThread, paramsDict, replicaID, true, useGrad) {
- _wStep = 0.01;
- _bStep = 0.01;
-}
-
-void FCLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- getActs().addProduct(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), scaleTargets, 1);
- if (scaleTargets == 0) {
- getActs().addVector(getBiasMatrix(passType), 1, getActs());
- }
-}
-
-void FCLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- NVMatrix& weights_T = getWeightMatrix(passType, inpIdx).getTranspose();
- _prev[replicaIdx][inpIdx]->getActsGrad().addProduct(v, weights_T, scaleTargets, 1);
- delete &weights_T;
-}
-
-void FCLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) {
- _biases->getGrad().addSum(v, 0, getBIncScale(), getBGradScale(passType));
-}
-
-void FCLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) {
- NVMatrix& prevActs_T = _inputs[inpIdx]->getTranspose();
- float scaleGrad = getGradScale(inpIdx, passType);
- float scaleInc = getIncScale(inpIdx, passType);
- getGradTarget(inpIdx).addProduct(prevActs_T, v, scaleInc, scaleGrad);
- delete &prevActs_T;
-}
-
-void FCLayer::_constrainWeights() {
- for (int i = 0; i < _weights->getSize(); i++) {
- if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) {
-// NVMatrix norm2; // Unfortunate extra weight matrix...
- _weights->at(i).getW().sumOfSquares(0, _norm2);
-// norm2.apply(MaxWeightConstraintOperator(_weights->at(i).getWBall()));
- _norm2.apply(HardWeightConstraintOperator(_weights->at(i).getWBall()));
- _weights->at(i).getW().eltwiseMultByVector(_norm2);
- }
- }
-}
-
-/*
- * =======================
- * SplitFCLayer
- * =======================
- */
-SplitFCLayer::SplitFCLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad)
- : FCLayer(convNetThread, paramsDict, replicaID, useGrad) {
- _numParts = pyDictGetInt(paramsDict, "parts");
-}
-
-void SplitFCLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- getActs().resize(_inputs[inpIdx]->getNumRows(), _numOutputs, true);
- NVMatrixV& splitInput = _inputs[inpIdx]->splitCols(_numParts);
- NVMatrixV& splitWeights = getWeightMatrix(passType, inpIdx).splitRows(_numParts);
- NVMatrixV& splitTarget = getActs().splitCols(_numParts);
-
- NVMatrix::batchedMatrixMultiply(splitInput, splitWeights, splitTarget, scaleTargets, 1);
- if (scaleTargets == 0) {
- getActs().addVector(getBiasMatrix(passType), 1, getActs());
- }
-
- deleteElements(splitInput, true);
- deleteElements(splitWeights, true);
- deleteElements(splitTarget, true);
-}
-
-void SplitFCLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- NVMatrix& weights_T = getWeightMatrix(passType, inpIdx).getTranspose();
- _prev[replicaIdx][inpIdx]->getActsGrad().resize(*_inputs[inpIdx]);
-
- NVMatrixV& splitV = v.splitCols(_numParts);
- NVMatrixV& splitWeights_T = weights_T.splitCols(_numParts);
- NVMatrixV& splitTarget = _prev[replicaIdx][inpIdx]->getActsGrad().splitCols(_numParts);
-
- NVMatrix::batchedMatrixMultiply(splitV, splitWeights_T, splitTarget, scaleTargets, 1);
-
- delete &weights_T;
- deleteElements(splitV, true);
- deleteElements(splitWeights_T, true);
- deleteElements(splitTarget, true);
-}
-
-void SplitFCLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) {
- NVMatrix& prevActs_T = _inputs[inpIdx]->getTranspose();
- NVMatrixV& splitPrevActs_T = prevActs_T.splitRows(_numParts);
- NVMatrixV& splitV = v.splitCols(_numParts);
- NVMatrixV& splitGradTarget = getGradTarget(inpIdx).splitRows(_numParts);
-
- NVMatrix::batchedMatrixMultiply(splitPrevActs_T, splitV, splitGradTarget, getIncScale(inpIdx, passType), getGradScale(inpIdx, passType));
-
- delete &prevActs_T;
- deleteElements(splitPrevActs_T, true);
- deleteElements(splitV, true);
- deleteElements(splitGradTarget, true);
-}
-
-/*
- * =======================
- * TwoDLayerInterface
- * =======================
- */
-TwoDLayerInterface::TwoDLayerInterface(PyObject* paramsDict) {
- _channels = pyDictGetInt(paramsDict, "channels");
- _imgSize = pyDictGetInt(paramsDict, "imgSize");
- _imgPixels = _imgSize * _imgSize;
-}
-
-/*
- * =======================
- * LocalLayer
- * =======================
- */
-LocalLayer::LocalLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool useGrad)
- : WeightLayer(convNetThread, paramsDict, replicaID, false, useGrad) {
- _padding = pyDictGetIntV(paramsDict, "padding");
- _stride = pyDictGetIntV(paramsDict, "stride");
- _filterSize = pyDictGetIntV(paramsDict, "filterSize");
- _channels = pyDictGetIntV(paramsDict, "channels");
- _imgSize = pyDictGetIntV(paramsDict, "imgSize");
- _numFilters = pyDictGetInt(paramsDict, "filters");
- _groups = pyDictGetIntV(paramsDict, "groups");
- _filterChannels = pyDictGetIntV(paramsDict, "filterChannels");
- _filterPixels = pyDictGetIntV(paramsDict, "filterPixels");
- _imgPixels = pyDictGetIntV(paramsDict, "imgPixels");
-
- _modulesX = pyDictGetInt(paramsDict, "modulesX");
- _modules = pyDictGetInt(paramsDict, "modules");
-}
-
-LocalLayer::~LocalLayer() {
- delete _padding;
- delete _stride;
- delete _filterSize;
- delete _channels;
- delete _imgSize;
- delete _groups;
- delete _filterChannels;
- delete _filterPixels;
- delete _imgPixels;
-}
-
-/*
- * =======================
- * ConvLayer
- * =======================
- */
-ConvLayer::ConvLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
- : LocalLayer(convNetThread, paramsDict, replicaID, true) {
- _sumWidth = pyDictGetInt(paramsDict, "sumWidth");
- _sharedBiases = pyDictGetInt(paramsDict, "sharedBiases");
- _weightContrastNormMin = pyDictGetFloatV(paramsDict, "wcNormMin");
- _weightContrastNormMax = pyDictGetFloatV(paramsDict, "wcNormMax");
-}
-
-ConvLayer::~ConvLayer() {
- delete _weightContrastNormMin;
- delete _weightContrastNormMax;
-}
-
-void ConvLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- convFilterActs(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), getActs(), _imgSize->at(inpIdx), _modulesX, _modulesX, _padding->at(inpIdx),
- _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1);
-
- if (scaleTargets == 0) {
- if (_sharedBiases) {
- getActs().reshape(_numFilters, getActs().getNumElements() / _numFilters);
- getActs().addVector(getBiasMatrix(passType));
- getActs().reshape(_numFilters * _modules, getActs().getNumElements() / (_numFilters * _modules));
- } else {
- getActs().addVector(getBiasMatrix(passType));
- }
- }
-}
-
-void ConvLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) {
- float scaleBGrad = getBGradScale(passType);
- float scaleInc = getBIncScale();
- if (_sharedBiases) {
- v.reshape(_numFilters, v.getNumElements() / _numFilters);
- _biases->getGrad().addSum(v, 1, scaleInc, scaleBGrad);
- v.reshape(_numFilters * _modules, v.getNumElements() / (_numFilters * _modules));
- } else {
- _biases->getGrad().addSum(v, 1, scaleInc, scaleBGrad);
- }
-}
-
-void ConvLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) {
- assert(_weights->at(inpIdx).isUseGrad());
- bool doPartialSum = _sumWidth < _modulesX;
- NVMatrix& tgt = doPartialSum ? _weightGradTmp : _weights->at(inpIdx).getGrad();
-
- float scaleWGrad = getGradScale(inpIdx, passType);
- float scaleTargets = getIncScale(inpIdx, passType) * !doPartialSum;
-
- convWeightActs(*_inputs[inpIdx], v, tgt, _imgSize->at(inpIdx), _modulesX, _modulesX, _filterSize->at(inpIdx), _padding->at(inpIdx),
- _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), _sumWidth, scaleTargets, scaleWGrad);
-
- if (doPartialSum) {
- scaleTargets = _weights->at(inpIdx).getNumUpdates() > 0;
- int outWidth = DIVUP(_modulesX, _sumWidth);
- _weightGradTmp.reshape(outWidth*outWidth, _filterChannels->at(inpIdx) * _filterPixels->at(inpIdx) * _numFilters);
- _weights->at(inpIdx).getGrad().addSum(_weightGradTmp, 0, scaleTargets, 1);
- _weights->at(inpIdx).getGrad().reshape(_filterChannels->at(inpIdx) * _filterPixels->at(inpIdx), _numFilters);
- }
-}
-
-void ConvLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- convImgActs(v, getWeightMatrix(passType, inpIdx), _prev[replicaIdx][inpIdx]->getActsGrad(), _imgSize->at(inpIdx), _imgSize->at(inpIdx), _modulesX,
- _padding->at(inpIdx), _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1);
-}
-
-void ConvLayer::truncBwdActs() {
- LocalLayer::truncBwdActs();
- _weightGradTmp.truncate();
-}
-
-void ConvLayer::_constrainWeights() {
- for (int i = 0; i < _weights->getSize(); i++) {
- if (_weightContrastNormMax->at(i) > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) {
- float fz = _weights->at(i).getW().getNumRows();
- NVMatrix tmp;
- _weights->at(i).getW().sum(0, tmp);
- _weights->at(i).getW().addVector(tmp, -1.0f / fz, _weights->at(i).getGrad());
- // Now _weights->at(i).getGrad() contains zero-mean filters
- _weights->at(i).getGrad().apply(NVMatrixOps::Square());
- _weights->at(i).getGrad().sum(0, tmp);
-
- tmp.apply(WeightContrastNormOperator(_weightContrastNormMin->at(i), _weightContrastNormMax->at(i), 1.0f / fz));
- // Now tmp has the stdev
- _weights->at(i).getW().eltwiseMultByVector(tmp);
- }
- // It's pretty silly to do both these things but whatever
- if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) {
-// NVMatrix norm2;
- _weights->at(i).getW().sumOfSquares(0, _norm2);
-
-// norm.apply(MaxWeightConstraintOperator(_weights->at(i).getWBall()));
- _norm2.apply(HardWeightConstraintOperator(_weights->at(i).getWBall()));
- _weights->at(i).getW().eltwiseMultByVector(_norm2);
- }
- }
-}
-
-/*
- * =======================
- * LocalUnsharedLayer
- * =======================
- */
-LocalUnsharedLayer::LocalUnsharedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
- : LocalLayer(convNetThread, paramsDict, replicaID, false) {
-}
-
-void LocalUnsharedLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- localFilterActs(*_inputs[inpIdx], getWeightMatrix(passType, inpIdx), getActs(), _imgSize->at(inpIdx), _modulesX, _modulesX, _padding->at(inpIdx),
- _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1);
- if (scaleTargets == 0) {
- getActs().addVector(getBiasMatrix(passType));
- }
-}
-
-void LocalUnsharedLayer::bpropBiases(NVMatrix& v, PASS_TYPE passType) {
- _biases->getGrad().addSum(v, 1, getBIncScale(), getBGradScale(passType));
-}
-
-void LocalUnsharedLayer::bpropWeights(NVMatrix& v, int replicaIdx, int inpIdx, PASS_TYPE passType) {
- float scaleWGrad = getGradScale(inpIdx, passType);
- float scaleInc = getIncScale(inpIdx, passType);
- localWeightActs(*_inputs[inpIdx], v, getGradTarget(inpIdx), _imgSize->at(inpIdx), _modulesX, _modulesX, _filterSize->at(inpIdx), _padding->at(inpIdx),
- _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleInc, scaleWGrad);
-}
-
-void LocalUnsharedLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- localImgActs(v, getWeightMatrix(passType, inpIdx), _prev[replicaIdx][inpIdx]->getActsGrad(),_imgSize->at(inpIdx), _imgSize->at(inpIdx), _modulesX,
- _padding->at(inpIdx), _stride->at(inpIdx), _channels->at(inpIdx), _groups->at(inpIdx), scaleTargets, 1);
-}
-
-void LocalUnsharedLayer::_constrainWeights() {
- for (int i = 0; i < _weights->getSize(); i++) {
- if (_weights->at(i).getWBall() > 0 && _weights->at(i).isOwner() && _weights->at(i).getLearningRateSchedule().getBaseValue() > 0) {
- normalizeLocalWeights(*_weights->at(i), _modules, _weights->at(i).getWBall());
- }
- }
-}
-
-/*
- * =======================
- * SoftmaxLayer
- * =======================
- */
-SoftmaxLayer::SoftmaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
- : Layer(convNetThread, paramsDict, replicaID, true), _doUpperGrad(false) {
-}
-
-void SoftmaxLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- NVMatrix& input = *_inputs[0];
- input.max(1, _max);
- input.addVector(_max, -1, getActs());
- getActs().apply(NVMatrixOps::Exp());
- getActs().sum(1, _sum);
- getActs().eltwiseDivideByVector(_sum);
-}
-
-void SoftmaxLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(inpIdx == 0);
- LayerV& prev = _prev[replicaIdx];
- if (_doUpperGrad) {
- // Todo: rethink replica IDs or idxes... this here doesn't make a huge amount of sense
- for (int i = 0; i < _next.size(); ++i) {
- if (_next[i]->isGradProducer(getName())) {
- NVMatrix& labels = _next[i]->getPrev()[replicaIdx][0]->getActs(getDeviceID()); // Get cost's labels
- float gradCoeff = dynamic_cast<CostLayer*>(_next[i])->getCoeff();
-
- computeLogregSoftmaxGrad(labels, getActs(), prev[0]->getActsGrad(), scaleTargets == 1, gradCoeff);
- break;
- }
- }
-
- } else {
- computeSoftmaxGrad(getActs(), v, prev[0]->getActsGrad(), scaleTargets, 1);
- }
-}
-
-void SoftmaxLayer::setDoUpperGrad(bool b) {
- _doUpperGrad = b;
-}
-
-/*
- * =======================
- * ConcatenationLayer
- * =======================
- */
-ConcatenationLayer::ConcatenationLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
- : Layer(convNetThread, paramsDict, replicaID, false) {
- _copyOffsets = pyDictGetIntV(paramsDict, "copyOffsets");
- _copyOffsets->push_back(_numOutputs);
-}
-
-ConcatenationLayer::~ConcatenationLayer() {
- delete _copyOffsets;
-}
-
-void ConcatenationLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- getActs().resize(_numOutputs, _inputs[inpIdx]->getNumCols());
- _inputs[inpIdx]->copy(getActs(), 0, -1, 0, -1, _copyOffsets->at(inpIdx), 0);
-}
-
-void ConcatenationLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- NVMatrix& copySrc = v.sliceRows(_copyOffsets->at(inpIdx), _copyOffsets->at(inpIdx + 1)); // view
- _prev[replicaIdx][inpIdx]->getActsGrad().add(copySrc, scaleTargets, 1);
- delete ©Src;
-}
-
-/*
- * =======================
- * PassThroughLayer
- * =======================
- */
-PassThroughLayer::PassThroughLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
- : Layer(convNetThread, paramsDict, replicaID, false) {
-}
-
-void PassThroughLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- // No-op
-}
-
-void PassThroughLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- // No-op
-}
-
-bool PassThroughLayer::postInit() {
- if (Layer::postInit()) {
- assert(getNumInputReplicas() == 1);
- for (int i = 0, offset = 0; i < _prev[0].size(); offset += _prev[0][i]->getNumOutputs(), i++) {
- MemoryView& vActs = _memSrcActs[getDeviceID()]->getMemorySource().addUser(_prev[0][i]->getName(), pair<int,int>(offset, offset + _prev[0][i]->getNumOutputs()));
- MemoryView& vActsGrad = _memSrcActsGrad[getDeviceID()]->getMemorySource().addUser(_prev[0][i]->getName(), pair<int,int>(offset, offset + _prev[0][i]->getNumOutputs()));
- _prev[0][i]->setMemorySourceActs(getDeviceID(), vActs);
- _prev[0][i]->setMemorySourceActsGrad(getDeviceID(), vActsGrad);
- }
- return true;
- }
- return false;
-}
-
-
-/*
- * =======================
- * EltwiseSumLayer
- * =======================
- */
-EltwiseSumLayer::EltwiseSumLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) {
- _coeffs = pyDictGetFloatV(paramsDict, "coeffs");
-}
-
-EltwiseSumLayer::~EltwiseSumLayer() {
- delete _coeffs;
-}
-
-void EltwiseSumLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- getActs().add(*_inputs[inpIdx], scaleTargets, _coeffs->at(inpIdx));
-}
-
-void EltwiseSumLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- _prev[replicaIdx][inpIdx]->getActsGrad().add(v, scaleTargets, _coeffs->at(inpIdx));
-}
-
-/*
- * =======================
- * EltwiseMaxLayer
- * =======================
- */
-EltwiseMaxLayer::EltwiseMaxLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) {
-}
-
-void EltwiseMaxLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- if (inpIdx == 1) { // First input, do nothing
- _inputs[inpIdx]->applyBinary(NVMatrixAggs::Max(), *_inputs[0], getActs());
- } else if (inpIdx > 1) {
- getActs().applyBinary(NVMatrixAggs::Max(), *_inputs[inpIdx]);
- }
-}
-
-void EltwiseMaxLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- computeEltwiseMaxGrad(v, *_inputs[inpIdx], getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), scaleTargets != 0);
-}
-
-
-/*
- * =======================
- * DropoutLayer
- * =======================
- *
- * TODO: optimize away the case when using dopout over relus. Don't need the keepmask.
- */
-DropoutLayer::DropoutLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) {
- _enable = pyDictGetInt(paramsDict, "enable");
- _keep = pyDictGetFloat(paramsDict, "keep");
-}
-
-void DropoutLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- if (_enable && passType == PASS_TRAIN) {
- _keepMask.resize(*_inputs[inpIdx]);
- _keepMask.randomizeUniform();
- _keepMask.apply(DropoutSmallerThanOperator(_keep));
- _inputs[inpIdx]->eltwiseMult(_keepMask, getActs());
- } else {
- _inputs[inpIdx]->copy(getActs());
- }
-}
-
-void DropoutLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- LayerV& prev = _prev[replicaIdx];
- if (_enable && passType == PASS_TRAIN) {
- if (scaleTargets != 0) {
- v.applyTernary(AddGradientBinaryOperator<NVMatrixBinaryOps::Multiply>(NVMatrixBinaryOps::Multiply()),
- _keepMask, prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad());
- } else {
- v.eltwiseMult(_keepMask, prev[inpIdx]->getActsGrad());
- }
- } else {
- prev[inpIdx]->getActsGrad().add(v, scaleTargets, 1);
- }
-}
-
-void DropoutLayer::truncBwdActs() {
- Layer::truncBwdActs();
- _keepMask.truncate();
-}
-
-
-/*
- * =======================
- * Dropout2Layer
- * =======================
- *
- * TODO: optimize away the case when using dopout over relus. Don't need the keepmask.
- */
-Dropout2Layer::Dropout2Layer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : DropoutLayer(convNetThread, paramsDict, replicaID) {
-}
-
-void Dropout2Layer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- if (_enable && passType == PASS_TRAIN) {
- _keepMask.resize(*_inputs[inpIdx]);
- _keepMask.randomizeUniform();
- _keepMask.smallerThanScalar(_keep);
- _inputs[inpIdx]->eltwiseMult(_keepMask, getActs());
- } else {
- _inputs[inpIdx]->scale(_keep, getActs());
- }
-}
-
-void Dropout2Layer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- LayerV& prev = _prev[replicaIdx];
- if (_enable && passType == PASS_TRAIN) {
- if (scaleTargets != 0) {
- v.applyTernary(AddGradientBinaryOperator<NVMatrixBinaryOps::Multiply>(NVMatrixBinaryOps::Multiply()),
- _keepMask, prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad());
- } else {
- v.eltwiseMult(_keepMask, prev[inpIdx]->getActsGrad());
- }
- } else {
- if (scaleTargets != 0) {
- v.applyBinary(AddGradientOperator<NVMatrixOps::MultByScalar>(NVMatrixOps::MultByScalar(_keep)),
- prev[inpIdx]->getActsGrad(), prev[inpIdx]->getActsGrad());
- } else {
- v.scale(_keep, prev[inpIdx]->getActsGrad());
- }
- }
-}
-
-/*
- * =======================
- * DataLayer
- * =======================
- */
-DataLayer::DataLayer(ConvNet* convNet, PyObject* paramsDict, int replicaID) : Layer(NULL, paramsDict, replicaID, false) {
- _dataIdx = pyDictGetInt(paramsDict, "dataIdx");
- _start = pyDictGetInt(paramsDict, "start");
- _end = pyDictGetInt(paramsDict, "end");
- _useBuffer = false;
- _outstandingCopyRequest = false;
- _convNet = convNet;
-}
-
-DataLayer::~DataLayer() {
- for (map<int,cudaStream_t>::const_iterator it = _copyStreams.begin(); it != _copyStreams.end(); ++it) {
- checkCudaErrors(cudaStreamDestroy(it->second));
- }
- for (std::map<int, MemoryView*>::iterator it = _memSrcActs2.begin(); it != _memSrcActs2.end(); ++it) {
- if (it->second->getMemorySource().truncate(_name)) {
- delete &it->second->getMemorySource();
- }
- }
- _copier->stop();
- delete _copier;
-}
-
-void DataLayer::fprop(PASS_TYPE passType, int passIdx, bool fromBuffer) {
- waitForCopyFinish();
- if (fromBuffer && getFwdActiveInputReplicaIdx(passIdx) >= 0) {
- _useBuffer = !_useBuffer;
- }
-
- for (int i = 0; i < _next.size(); i++) {
- _next[i]->getConvNetThread().getMessageQueue().enqueue(new FpropMessage(*_next[i], passType, passIdx));
- }
-}
-
-void DataLayer::waitForCopyFinish() {
- if (_outstandingCopyRequest) {
- _copyFinishQueue.dequeue();
- assert(_copyFinishQueue.getNumElements() == 0);
- _outstandingCopyRequest = false;
- }
-}
-
-cudaStream_t DataLayer::getCopyStream(int deviceID) {
- if (_copyStreams.count(deviceID) == 0) {
- NVMatrix::setDeviceID(deviceID);
- checkCudaErrors(cudaStreamCreateWithFlags(&_copyStreams[deviceID], cudaStreamNonBlocking));
- }
- return _copyStreams[deviceID];
-}
-
-void DataLayer::copyData(CPUData& data, bool other, int passIdx) {
- assert(!_outstandingCopyRequest);
- assert(_copyFinishQueue.getNumElements() == 0);
- _copier->getQueue().enqueue(new DataCopyMessage(data, other, passIdx));
- _outstandingCopyRequest = true;
-}
-
-int DataLayer::getNumInputReplicas() {
- return _convNet->getNumReplicasMax() / getNumReplicas();
-}
-
-void DataLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
-
-}
-
-NVMatrix& DataLayer::getActs(int deviceID) {
- return getActs(deviceID, false, -1);
-}
-
-NVMatrix& DataLayer::getActs(int deviceID, bool other, int numCases) {
-// printf("%s[%d] getActs(%d, %d, %d)\n", _name.c_str(), getReplicaID(), deviceID, other, numCases);
- assert(_memSrcActs.count(deviceID) > 0);
- assert(_memSrcActs2.count(deviceID) > 0);
- return (_useBuffer != other ? _memSrcActs2[deviceID]->getMemory(numCases) : _memSrcActs[deviceID]->getMemory(numCases));
-}
-
-ConvNet& DataLayer::getConvNet() {
- return *_convNet;
-}
-
-bool DataLayer::postInit() {
- if (Layer::postInit()) {
- for (int i = 0; i < _next.size(); ++i) {
- int d = _next[i]->getDeviceID();
- if (_memSrcActs2.count(d) == 0) {
- _memSrcActs2[d] = &MemorySource::make(_numOutputs, d, getName());
- }
- }
- intv cpus = getDeviceCPUs(_next[0]->getDeviceID());
- _copier = new DataCopyThread(*this, cpus);
- _copier->start();
- return true;
- }
- return false;
-}
-
-bool DataLayer::isGradProducer() {
- return false;
-}
-
-/*
- * =======================
- * DataCopyThread
- * =======================
- */
-DataCopyThread::DataCopyThread(DataLayer& parent, intv& cpus) : _parent(&parent), _sleepUsec(0), Thread(true, cpus) {
-}
-
-Queue<DataCopyMessage*>& DataCopyThread::getQueue() {
- return _queue;
-}
-
-void DataCopyThread::stop() {
- getQueue().enqueue(new DataCopyExitMessage());
- join();
-}
-
-void* DataCopyThread::run() {
- NVMatrix::setDeviceID(*_parent->getNextDeviceIDs().begin());
- bool exit = false;
- while(!exit) {
- DataCopyMessage& msg = *_queue.dequeue();
- exit = msg.getType() == DataCopyMessage::EXIT;
- if (!exit) {
- CPUData& data = msg.getData();
- int passIdx = msg.getPassIdx();
- bool other = msg.isOther();
-
- Matrix& dataMatrix = data.getData(_parent->getDataIdx());
- // How many times is this layer going to process microbatches from this minibatch?
- assert(_parent->getNumReplicasNext() == _parent->getNumReplicas());
- int microIdx = _parent->getFwdActiveInputReplicaIdx(passIdx);
-
- if (microIdx >= 0) {
- if (_requestTimer.isStarted()) {
- double requestIntervalMsec = _requestTimer.stop();
- // Sleep for up to 1/20th the average request interval
- _sleepUsec = int(round(0.95 * _sleepUsec + 0.05 * (_parent->getReplicaID() / double(_parent->getNumReplicas())) * requestIntervalMsec * 1000.0 / 20.0));
- }
- _requestTimer.start();
- if (other) {
- // Sleeping a bit is helpful because in typical nets, copying input data
- // as soon as it's available will produce contention with other communications
- // that are happening at the time. This is very much a hack, so in the future
- // it might be good to replace it with something smarter which schedules access
- // to communication links.
- usleep(_sleepUsec);
- }
- microIdx += _parent->getReplicaID() * _parent->getNumInputReplicas();
- // Safer to divup because this way you won't get a minibatch size of 0
- int microbatchSize = DIVUP(data.getNumCases(), _parent->getConvNet().getNumReplicasMax());
- int microStart = microIdx * microbatchSize;
- int microEnd = min(data.getNumCases(), (microIdx + 1) * microbatchSize);
- // Check that this replica has some data. This can be false when, for example,
- // there are only 7 examples in the minibatch but 8 replicas.
- if (microStart < microEnd) {
- assert(dataMatrix.isView() == dataMatrix.isTrans());
- int pipe = _parent->getConvNet().getDataCopyPD().getPipe(_parent->getReplicaID()/2);
- if (dataMatrix.isTrans()) {
- Matrix& replicaDataMatrix = dataMatrix.sliceCols(microStart, microEnd);
- // In this case, dataMatrix is a view on memory allocated by Python.
- //_hostMemFwd.copyFromHost(replicaDataMatrix, true);
- _hostMemFwd.resize(replicaDataMatrix.getNumRows(), replicaDataMatrix.getNumCols(), true);
- memcpy(_hostMemFwd.getDevData(), replicaDataMatrix.getData(), replicaDataMatrix.getNumDataBytes());
- delete &replicaDataMatrix; // view
- NVMatrix& hostMemFwdSlice = _hostMemFwd.sliceRows(_parent->getStart(), _parent->getEnd());
- for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) {
- int deviceID = *it;
- // Copy my output to this guy's GPU
- NVMatrix::setDeviceID(deviceID);
- // Note to self: this is the path that gets executed in practice
- // in my models. It does a transpose & copy simultaneously.
- hostMemFwdSlice.flipTrans(_parent->getActs(deviceID, other, microEnd - microStart), _parent->getCopyStream(deviceID));
- }
- delete &hostMemFwdSlice;
- } else {
- // Hacky way to copy a slice to _hostMemFwd
- _hostMemFwd.resize(dataMatrix.getNumRows(), microEnd - microStart);
- Matrix tmp(_hostMemFwd.getDevData(), _hostMemFwd.getNumRows(), _hostMemFwd.getNumCols(), _hostMemFwd.isTrans());
- dataMatrix.sliceCols(microStart, microEnd, tmp);
- NVMatrix& hostMemFwdSlice = _hostMemFwd.sliceRows(_parent->getStart(), _parent->getEnd());
- for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) {
- int deviceID = *it;
- // Copy my output to this guy's GPU
- NVMatrix::setDeviceID(deviceID);
- hostMemFwdSlice.copy(_parent->getActs(deviceID, other, microEnd - microStart), _parent->getCopyStream(deviceID));
- }
- delete &hostMemFwdSlice;
- }
-
- for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) {
- int deviceID = *it;
- NVMatrix::setDeviceID(deviceID);
- NVMatrix::syncStream(_parent->getCopyStream(deviceID));
- }
- _parent->getConvNet().getDataCopyPD().freePipe(pipe);
- } else {
- for (intv::iterator it = _parent->getNextDeviceIDs().begin(); it != _parent->getNextDeviceIDs().end(); ++it) {
- int deviceID = *it;
- _parent->getActs(deviceID, other, 0);
- }
- }
- }
- _parent->getCopyFinishQueue().enqueue(1);
- }
- delete &msg;
- }
- return NULL;
-}
-
-/*
- * =====================
- * PoolLayer
- * =====================
- */
-PoolLayer::PoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans)
- : Layer(convNetThread, paramsDict, replicaID, trans), TwoDLayerInterface(paramsDict) {
- _sizeX = pyDictGetInt(paramsDict, "sizeX");
- _start = pyDictGetInt(paramsDict, "start");
- _stride = pyDictGetInt(paramsDict, "stride");
- _outputsX = pyDictGetInt(paramsDict, "outputsX");
- _pool = pyDictGetString(paramsDict, "pool");
-}
-
-PoolLayer& PoolLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) {
- std::string _pool = pyDictGetString(paramsDict, "pool");
- if (_pool == "max") {
- return *new MaxPoolLayer(convNetThread, paramsDict, replicaID, false);
- } else if(_pool == "maxabs") {
- return *new MaxPoolLayer(convNetThread, paramsDict, replicaID, true);
- } else if(_pool == "avg") {
- return *new AvgPoolLayer(convNetThread, paramsDict, replicaID);
- }
- throw std::string("Unknown pooling layer type ") + _pool;
-}
-
-/*
- * =====================
- * AvgPoolLayer
- * =====================
- */
-AvgPoolLayer::AvgPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : PoolLayer(convNetThread, paramsDict, replicaID, false) {
- _sum = pyDictGetInt(paramsDict, "sum");
-}
-
-void AvgPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- if (_sum) {
- convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler<true>());
- } else {
- convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, AvgPooler<false>());
- }
-}
-
-void AvgPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- convLocalAvgUndo(v, _prev[replicaIdx][0]->getActsGrad(), _sizeX, _start, _stride, _outputsX, _imgSize, _sum, scaleTargets, 1);
-}
-
-/*
- * =====================
- * MaxPoolLayer
- * =====================
- */
-MaxPoolLayer::MaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool abs) : PoolLayer(convNetThread, paramsDict, replicaID, false), _abs(abs) {
-}
-
-void MaxPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- if (_abs) {
- convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, MaxAbsPooler());
- } else {
- convLocalPool(*_inputs[0], getActs(), _channels, _sizeX, _start, _stride, _outputsX, MaxPooler());
- }
-}
-
-void MaxPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(inpIdx == 0);
- convLocalMaxUndo(*_inputs[0], v, getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), _sizeX, _start, _stride, _outputsX, scaleTargets, 1);
-}
-
-/*
- * =====================
- * CrossMapPoolLayer
- * =====================
- */
-CrossMapPoolLayer::CrossMapPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans)
- : Layer(convNetThread, paramsDict, replicaID, trans), TwoDLayerInterface(paramsDict) {
- _size = pyDictGetInt(paramsDict, "size");
- _start = pyDictGetInt(paramsDict, "start");
- _stride = pyDictGetInt(paramsDict, "stride");
- _outputs = pyDictGetInt(paramsDict, "outputChannels");
- _pool = pyDictGetString(paramsDict, "pool");
-}
-
-CrossMapPoolLayer& CrossMapPoolLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) {
- std::string _pool = pyDictGetString(paramsDict, "pool");
- if (_pool == "max") {
- return *new CrossMapMaxPoolLayer(convNetThread, paramsDict, replicaID);
- }
- throw std::string("Unknown pooling layer type ") + _pool;
-}
-
-/*
- * =====================
- * CrossMapMaxPoolLayer
- * =====================
- */
-CrossMapMaxPoolLayer::CrossMapMaxPoolLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CrossMapPoolLayer(convNetThread, paramsDict, replicaID, false) {
-}
-
-void CrossMapMaxPoolLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- convPoolCrossMap(*_inputs[0], getActs(), _start, _size, _outputs, _stride, _imgSize, MaxPooler());
-}
-
-void CrossMapMaxPoolLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(inpIdx == 0);
- convCrossMapMaxPoolUndo(*_inputs[0], v, getActs(), _prev[replicaIdx][0]->getActsGrad(), _imgSize, _start, _size, _stride, scaleTargets, 1);
-}
-
-/*
- * =====================
- * RandomScaleLayer
- * =====================
- */
-RandomScaleLayer::RandomScaleLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
- _maxScale = pyDictGetFloat(paramsDict, "maxScale");
- _tgtSize = pyDictGetInt(paramsDict, "tgtSize");
- // The smallest size the image could be after rescaling
- _minScaledSize = _imgSize / _maxScale;
-
- // The number of discrete scales we're considering
- int numScales = _imgSize - _minScaledSize + 1;
-
- // The total number of squares of size _tgtSize that we can extract
- // from all these scales
- double numCrops = numScales * (numScales + 1) * (2 * numScales + 1) / 6;
-
- // For each scale, record the fraction of the squares that it has.
- // This will be the probability of sampling this scale.
- _scaleProbs.push_back(1.0 / numCrops);
- for (int s = 1; s < numScales; ++s) {
- _scaleProbs.push_back(_scaleProbs[s-1] + (s + 1) * (s + 1) / numCrops);
- }
-}
-
-void RandomScaleLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- if (IS_TRAIN(passType)) {
- // _maxScale is in the range [1, 2)
- float r = randf;
- int rescaledSize = _tgtSize;
- float scaleFactor = _maxScale;
- // Find which scale we have sampled
- for (int s = 0; s < _scaleProbs.size(); ++s) {
- if (r <= _scaleProbs[s]) {
- rescaledSize += s;
- float scaleFactorEnd = _imgSize / float(rescaledSize);
- float scaleFactorStart = max(1.0, _imgSize / (1.0 + rescaledSize));
- scaleFactor = scaleFactorStart + randf * (scaleFactorEnd - scaleFactorStart);
- break;
- }
- }
- assert(rescaledSize >= _tgtSize);
- int maxStart = rescaledSize - _tgtSize;
- int startY = rand() % (1 + maxStart), startX = rand() % (1 + maxStart);
- if (rescaledSize == _imgSize) {
- convCrop(*_inputs[0], getActs(), rescaledSize, _tgtSize, startY, startX);
- } else {
- convResizeBilinear(*_inputs[0], _rescaledActs, _imgSize, rescaledSize, scaleFactor);
- convCrop(_rescaledActs, getActs(), rescaledSize, _tgtSize, startY, startX);
- }
- _rescaledActs.truncate(); // this'll have a different size each time so may as well truncate it.
- } else if (IS_MULTIVIEW_TEST(passType)) { // for now...
- _inputs[0]->copy(getActs());
- } else if (IS_TEST(passType)) { // Test on center patch
- convResizeBilinear(*_inputs[0], getActs(), _imgSize, _tgtSize, _maxScale);
- }
-}
-
-void RandomScaleLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(false);
-}
-
-/*
- * =====================
- * CropLayer
- * =====================
- */
-CropLayer::CropLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
- _startX = pyDictGetInt(paramsDict, "startX");
- _startY = pyDictGetInt(paramsDict, "startY");
- _tgtSize = pyDictGetInt(paramsDict, "sizeX");
-}
-
-void CropLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- convCrop(*_inputs[0], getActs(), _imgSize, _tgtSize, _startY, _startX);
-}
-
-void CropLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(false);
-}
-
-/*
- * =====================
- * NailbedLayer
- * =====================
- */
-NailbedLayer::NailbedLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
- _start = pyDictGetInt(paramsDict, "start");
- _stride = pyDictGetInt(paramsDict, "stride");
- _outputsX = pyDictGetInt(paramsDict, "outputsX");
-}
-
-void NailbedLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- convBedOfNails(*_inputs[0], getActs(), _channels, _imgSize, _start, _stride, 0, 1);
-}
-
-void NailbedLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- convBedOfNailsUndo(v, _prev[replicaIdx][0]->getActsGrad(), _channels, _imgSize, _start, _stride, scaleTargets, 1);
-}
-
-/*
- * =====================
- * GaussianBlurLayer
- * =====================
- */
-GaussianBlurLayer::GaussianBlurLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
- _hFilter = pyDictGetMatrix(paramsDict, "filter");
-}
-
-GaussianBlurLayer::~GaussianBlurLayer() {
- delete _hFilter;
-}
-
-void GaussianBlurLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- convGaussianBlur(*_inputs[0], _filter, getActs(), true, _channels, 0, 1);
- convGaussianBlur(getActs(), _filter, getActs(), false, _channels, 0, 1);
-}
-
-void GaussianBlurLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- NVMatrix& tgt = _prev[replicaIdx][0]->getNumComputedActsGrads(getDeviceID()) > 0 ? _actGradsTmp : _prev[replicaIdx][0]->getActsGrad();
- convGaussianBlur(v, _filter, tgt, true, _channels, 0, 1);
- convGaussianBlur(tgt, _filter, _prev[replicaIdx][0]->getActsGrad(), false, _channels, scaleTargets, 1);
-}
-
-void GaussianBlurLayer::copyToGPU() {
- _filter.copyFromHost(*_hFilter, true);
-}
-
- /*
- * =====================
- * HorizontalReflectionLayer
- * =====================
- */
-HorizontalReflectionLayer::HorizontalReflectionLayer(ConvNetThread* convNet, PyObject* paramsDict, int replicaID) : Layer(convNet, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
- assert(_channels >= 1 && _channels <= 3);
-}
-
-void HorizontalReflectionLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- convReflectHorizontal(*_inputs[0], getActs(), _imgSize);
-}
-
-void HorizontalReflectionLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- convReflectHorizontal(v, _prev[replicaIdx][0]->getActsGrad(), _imgSize);
-}
-
-/*
- * =====================
- * ResizeLayer
- * =====================
- */
-ResizeLayer::ResizeLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
- _tgtSize = pyDictGetInt(paramsDict, "tgtSize");
- _scale = pyDictGetFloat(paramsDict, "scale");
-}
-
-void ResizeLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- convResizeBilinear(*_inputs[0], getActs(), _imgSize, _tgtSize, _scale);
-}
-
-// Can't do this
-void ResizeLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(false);
-}
-
-/*
- * =====================
- * RGBToYUVLayer
- * =====================
- */
-RGBToYUVLayer::RGBToYUVLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) {
-}
-
-void RGBToYUVLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- convRGBToYUV(*_inputs[0], getActs());
-}
-
-// Can't do this
-void RGBToYUVLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(false);
-}
-
-/*
- * =====================
- * RGBToLABLayer
- * =====================
- */
-RGBToLABLayer::RGBToLABLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : Layer(convNetThread, paramsDict, replicaID, false) {
- _center = pyDictGetInt(paramsDict, "center");
-}
-
-void RGBToLABLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- convRGBToLAB(*_inputs[0], getActs(), _center);
-}
-
-// Can't do this
-void RGBToLABLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(false);
-}
-
-/*
- * =====================
- * ResponseNormLayer
- * =====================
- */
-ResponseNormLayer::ResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
-: Layer(convNetThread, paramsDict, replicaID, false), TwoDLayerInterface(paramsDict) {
- _size = pyDictGetInt(paramsDict, "size");
- _scale = pyDictGetFloat(paramsDict, "scale");
- _pow = pyDictGetFloat(paramsDict, "pow");
- _minDiv = pyDictGetFloat(paramsDict, "minDiv");
-}
-
-void ResponseNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- convResponseNorm(*_inputs[0], _denoms, getActs(), _channels, _size, _scale, _pow, _minDiv);
-}
-
-void ResponseNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- convResponseNormUndo(v, _denoms, *_inputs[0], getActs(), _prev[replicaIdx][0]->getActsGrad(), _channels, _size, _scale, _pow, scaleTargets, 1);
-}
-
-void ResponseNormLayer::truncBwdActs() {
- Layer::truncBwdActs();
- _denoms.truncate();
-}
-
-/*
- * =====================
- * CrossMapResponseNormLayer
- * =====================
- */
-CrossMapResponseNormLayer::CrossMapResponseNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
-: ResponseNormLayer(convNetThread, paramsDict, replicaID) {
- _blocked = pyDictGetInt(paramsDict, "blocked");
-}
-
-void CrossMapResponseNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- assert(inpIdx == 0);
- convResponseNormCrossMap(*_inputs[0], getActs(), _channels, _size, _scale, _pow, _minDiv, _blocked);
-}
-
-void CrossMapResponseNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- convResponseNormCrossMapUndo(v, *_inputs[0], getActs(), _prev[replicaIdx][0]->getActsGrad(), _channels, _size, _scale, _pow, _minDiv, _blocked, scaleTargets, 1);
-}
-
-/*
- * =====================
- * ContrastNormLayer
- * =====================
- */
-ContrastNormLayer::ContrastNormLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : ResponseNormLayer(convNetThread, paramsDict, replicaID) {
-}
-
-void ContrastNormLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- NVMatrix& images = *_inputs[0];
- convLocalPool(images, _meanDiffs, _channels, _size, -_size/2, 1, _imgSize, AvgPooler<false>());
- _meanDiffs.add(images, -1, 1);
- convContrastNorm(images, _meanDiffs, _denoms, getActs(), _channels, _size, _scale, _pow, _minDiv);
-}
-
-void ContrastNormLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- convContrastNormUndo(v, _denoms, _meanDiffs, getActs(), _prev[replicaIdx][inpIdx]->getActsGrad(), _channels, _size, _scale, _pow, scaleTargets, 1);
-}
-
-void ContrastNormLayer::truncBwdActs() {
- ResponseNormLayer::truncBwdActs();
- _meanDiffs.truncate();
-}
-
-/*
- * =====================
- * CostLayer
- * =====================
- */
-CostLayer::CostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID, bool trans)
- : Layer(convNetThread, paramsDict, replicaID, trans) {
- _coeff = pyDictGetFloat(paramsDict, "coeff");
- _numCases = 0;
- _aggregated = pyDictGetInt(paramsDict, "aggregated") != 0;
-}
-
-float CostLayer::getCoeff() {
- return _coeff;
-}
-
-void CostLayer::bprop(NVMatrix& v, PASS_TYPE passType, int passIdx) {
- if (_coeff != 0) {
- Layer::bprop(v, passType, passIdx);
- }
-}
-
-bool CostLayer::fprop(PASS_TYPE passType, int passIdx) {
- if (Layer::fprop(passType, passIdx)) {
- syncStream();
- getConvNet().getMessageQueue().enqueue(new Message(FPROP_TERMINAL));
- return true;
- }
- return false;
-}
-
-void CostLayer::fpropCommon(PASS_TYPE passType) {
- _numCases = Layer::getNumCases(*_inputs[0]);
-}
-
-int CostLayer::getNumCases() {
- return _numCases;
-}
-
-bool CostLayer::isGradProducer() {
- return _coeff != 0;
-}
-
-doublev& CostLayer::getCost() {
- return *new doublev(_costv);
-}
-
-// This is called between microbatches
-void CostLayer::resetPassIdx() {
- Layer::resetPassIdx();
- _costv.clear();
-}
-
-CostLayer& CostLayer::make(ConvNetThread* convNetThread, PyObject* paramsDict, std::string& type, int replicaID) {
- if (type == "cost.crossent") {
- return *new CrossEntCostLayer(convNetThread, paramsDict, replicaID);
- } else if (type == "cost.bce") {
- return *new BinomialCrossEntropyCostLayer(convNetThread, paramsDict, replicaID);
- } else if (type == "cost.dce") {
- return *new DetectionCrossEntropyCostLayer(convNetThread, paramsDict, replicaID);
- } else if (type == "cost.logreg") {
- return *new LogregCostLayer(convNetThread, paramsDict, replicaID);
- } else if (type == "cost.sum2") {
- return *new SumOfSquaresCostLayer(convNetThread, paramsDict, replicaID);
- }
- throw std::string("Unknown cost layer type ") + type;
-}
-
-/*
- * =====================
- * CrossEntCostLayer
- * =====================
- */
-CrossEntCostLayer::CrossEntCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) {
-}
-
-void CrossEntCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- // This layer uses its two inputs together
- if (inpIdx == 0) {
- NVMatrix& labels = *_inputs[0];
- NVMatrix& probs = *_inputs[1];
- int numCases = labels.getLeadingDim();
- computeCrossEntCost(labels, probs, _trueLabelLogProbs, _correctProbs);
- _costv.clear();
- _costv.push_back(-_trueLabelLogProbs.sum());
- _costv.push_back(numCases - _correctProbs.sum());
- }
-}
-
-void CrossEntCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(inpIdx == 1);
- LayerV& prev = _prev[replicaIdx];
- NVMatrix& labels = *_inputs[0];
- NVMatrix& probs = *_inputs[1];
- NVMatrix& target = prev[1]->getActsGrad();
- // Numerical stability optimization: if the layer below me is a softmax layer, let it handle
- // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity.
- bool doWork = prev[1]->getNext().size() > 1 || prev[1]->getType() != "softmax" || prev[1]->getDeviceID() != getDeviceID();
- if (doWork) {
- computeCrossEntGrad(labels, probs, target, scaleTargets == 1, _coeff);
- }
-}
-
-/*
- * =====================
- * BinomialCrossEntropyCostLayer
- * =====================
- */
-BinomialCrossEntropyCostLayer::BinomialCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) {
- _computeSoftmaxErrorRate = pyDictGetInt(paramsDict, "computeSoftmaxErrorRate");
- _posWeight = pyDictGetFloat(paramsDict, "posWeight");
-}
-
-void BinomialCrossEntropyCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- // This layer uses its two inputs together
- if (inpIdx == 0) {
- NVMatrix& labels = *_inputs[0];
- NVMatrix& probs = *_inputs[1];
- int numCases = labels.getLeadingDim();
- labels.applyBinary(BinomialCrossEntOperator(_posWeight), probs, _tmpProbs);
- _costv.clear();
- // Cross-entropy cost
- _costv.push_back(-_tmpProbs.sum(_tmpbuf));// / labels.getFollowingDim());
-
- // If aggregated, we don't produce these outputs because they're not additive.
- // They have no meaning if this is just a partial cost.
- if (!_aggregated) {
- // "Correct" classifications. To compute these we threshold probs
- // and just count the number of entries that agree with labels.
- probs.biggerThanScalar(0.5, _tmpProbs);
- _tmpProbs.equals(labels);
- _costv.push_back((_tmpProbs.getNumElements() - _tmpProbs.sum(_tmpbuf)) / double(labels.getFollowingDim()));
-
- if (_computeSoftmaxErrorRate) {
- // Also compute top-1 error as if this is softmax and there's only one correct class
- probs.max(0, _tmpVec);
- assert(_tmpVec.getNumElements() == numCases); // Make sure we did max on correct axis
- probs.equalsVector(_tmpVec, _correctProbs);
- _correctProbs.sum(0, _tmpVec); // Divide by the # of labels that we predict as being present
- float m = _tmpVec.max();
-
- _correctProbs.eltwiseDivideByVector(_tmpVec);
- _correctProbs.eltwiseMult(labels);
-
- _costv.push_back(numCases - _correctProbs.sum(_tmpbuf));
- }
- }
- }
-}
-
-void BinomialCrossEntropyCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- assert(inpIdx == 1);
- LayerV& prev = _prev[replicaIdx];
- NVMatrix& labels = *_inputs[0];
- NVMatrix& probs = *_inputs[1];
- NVMatrix& target = prev[1]->getActsGrad();
- // Numerical stability optimization: if the layer below me is a logistic neuron layer, let it handle
- // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity.
- bool doWork = prev[1]->getNext().size() > 1
- || prev[1]->getType() != "neuron"
- || static_cast<NeuronLayer*>(prev[1])->getNeuronType() != "logistic"
- || prev[1]->getDeviceID() != getDeviceID()
- || prev[1]->getNumReplicas() != getNumReplicas();
- if (doWork) {
- printf("Computing cross-entropy gradient the stupid way\n");
- if (scaleTargets == 0) {
- labels.applyBinary(BinomialCrossEntGradientOperator(_coeff, _posWeight), probs, target);
- } else {
- labels.applyTernary(AddGradientBinaryOperator<BinomialCrossEntGradientOperator>(BinomialCrossEntGradientOperator(_coeff, _posWeight)), probs, target, target);
- }
- }
-}
-
-float BinomialCrossEntropyCostLayer::getPosWeight() {
- return _posWeight;
-}
-/*
- * =====================
- * DetectionCrossEntropyCostLayer
- * =====================
- */
-DetectionCrossEntropyCostLayer::DetectionCrossEntropyCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID)
- : BinomialCrossEntropyCostLayer(convNetThread, paramsDict, replicaID) {
- assert(!_aggregated);
-}
-
-void DetectionCrossEntropyCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- BinomialCrossEntropyCostLayer::fpropActs(inpIdx, scaleTargets, passType, passIdx);
- // This layer uses its two inputs together
- if (inpIdx == 0) {
- NVMatrix& labels = *_inputs[0];
- NVMatrix& probs = *_inputs[1];
- int numCases = labels.getLeadingDim();
-
- /*
- * Add information sufficient to compute precision and recall for each class.
- */
- // NOTE: _tmpProbs contains ((probs > 0.5) == labels)
- labels.sum(1, _numPositive); // sum(labels, 1)
-
- _tmpProbs.eltwiseMult(labels); // labels * ((probs > 0.5) == labels)
- _tmpProbs.sum(1, _numTruePositive);
-
- probs.biggerThanScalar(0.5, _tmpProbs);
- _tmpProbs.sum(1, _numDeclaredPositive);
-
- _numDeclaredPositive.copyToHost(_hNumDeclaredPositive, true);
- _numPositive.copyToHost(_hNumPositive, true);
- _numTruePositive.copyToHost(_hNumTruePositive, true);
-
- for (int i = 0; i < labels.getFollowingDim(); ++i) {
- _costv.push_back(_hNumDeclaredPositive(i, 0)); // 2
- _costv.push_back(_hNumPositive(i, 0)); // 3
- _costv.push_back(_hNumTruePositive(i, 0)); // 4
- }
-
- }
-}
-
-/*
- * =====================
- * LogregCostLayer
- * =====================
- */
-LogregCostLayer::LogregCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) {
- _topk = pyDictGetInt(paramsDict, "topk");
-// _numAccumed = 0;
-}
-
-void LogregCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- // This layer uses its two inputs together
- if (inpIdx == 0) {
- NVMatrix& labels = *_inputs[0];
- NVMatrix* probs = _inputs[1];
-
- _doCompute = !IS_MULTIVIEW_TEST(passType);
- if (!_doCompute) {
- if (IS_MULTIVIEW_TEST_START(passType)) {
- if (_probsAccum.count(passIdx) == 0) {
- _probsAccum[passIdx] = new NVMatrix(*probs);
- }
- probs->copy(*_probsAccum[passIdx]);
- _numAccumed[passIdx] = 1;
- } else {
- _probsAccum[passIdx]->add(*probs);
- _numAccumed[passIdx] += 1;
- }
- if (IS_MULTIVIEW_TEST_END(passType)) {
- probs = _probsAccum[passIdx];
- probs->scale(1.0 / _numAccumed[passIdx]);
- _doCompute = true;
- }
- }
- if (_doCompute) {
- int numCases = labels.getNumElements();
- probs->max(0,_maxProbs);
- if (_topk == 1) {
- computeLogregCost(labels, *probs, _maxProbs, _trueLabelLogProbs, _correctProbs);
- } else {
- computeMultiSoftmaxCost(labels, *probs, _maxProbs, _trueLabelLogProbs, _correctProbs, _topkProbs, _topk);
- }
- _costv.clear();
- double top1 = _correctProbs.sum(_tmpbuf);
-
- _costv.push_back(-_trueLabelLogProbs.sum(_tmpbuf));
- _costv.push_back(numCases - top1);
- _costv.push_back(numCases - (_topk == 1 ? top1 : _topkProbs.sum(_tmpbuf)));
-
- }
- }
-}
-
-NVMatrix& LogregCostLayer::getProbsAccum(int replicaIdx) {
- return *_probsAccum[replicaIdx];
-}
-
-void LogregCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- if (inpIdx == 1) {
- LayerV& prev = _prev[replicaIdx];
- NVMatrix& labels = *_inputs[0];
- NVMatrix& probs = *_inputs[1];
- NVMatrix& target = prev[1]->getActsGrad();
- // Numerical stability optimization: if the layer below me is a softmax layer, let it handle
- // the entire gradient computation to avoid multiplying and dividing by a near-zero quantity.
- bool doWork = prev[1]->getNext().size() > 1 || prev[1]->getType() != "softmax"
- || prev[1]->getDeviceID() != getDeviceID() || prev[1]->getNumReplicas() != getNumReplicas();
- if (prev[1]->getType() == "softmax") {
- static_cast<SoftmaxLayer*>(prev[1])->setDoUpperGrad(!doWork);
- }
- if (doWork) {
- computeLogregGrad(labels, probs, target, scaleTargets == 1, _coeff);
- }
- }
-}
-
-/*
- * =====================
- * SumOfSquaresCostLayer
- * =====================
- */
-SumOfSquaresCostLayer::SumOfSquaresCostLayer(ConvNetThread* convNetThread, PyObject* paramsDict, int replicaID) : CostLayer(convNetThread, paramsDict, replicaID, false) {
-}
-
-void SumOfSquaresCostLayer::fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType, int passIdx) {
- _inputs[0]->apply(NVMatrixOps::Square(), _tmp);
- _costv.clear();
- _costv.push_back(_tmp.sum());
-}
-
-void SumOfSquaresCostLayer::bpropActs(NVMatrix& v, int replicaIdx, int inpIdx, float scaleTargets, PASS_TYPE passType) {
- _prev[replicaIdx][inpIdx]->getActsGrad().add(*_inputs[0], scaleTargets, -2 * _coeff);
-}
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include <vector>
-#include <cmath>
-#include "../include/layer_kernels.cuh"
-
-using namespace std;
-
-/*
- * E = -log(y_t)
- * probs: (numOut, numCases)
- * labels: (1, numCases)
- * maxEnergies: (1, numCases)
- * labelLogProbs: (1, numCases) (*out)
- * correctProbs: (1, numCases) (*out)
- * top5Probs: (1, numCases) (*out)
- *
- * target: (1, numCases)
- *
- */
-__global__ void kMultiSoftmaxCost(float* probs, float* labels, float* maxProbs,
- float* labelLogProbs, float* correctProbs, float* top5Probs,
- const int numCases, const int numOut, const int setSize) {
- const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
-
- if (tx < numCases) {
- const int label = int(labels[tx]);
- const float maxp = maxProbs[tx];
- const float labelp = probs[label * numCases + tx];
-
- labelLogProbs[tx] = __logf(labelp);
-
- int numBiggerProbs = 0, numEqualsProbs = 0;
- for (int i = 0; i < numOut; ++i) {
- numBiggerProbs += probs[i * numCases + tx] > labelp;
- numEqualsProbs += probs[i * numCases + tx] == labelp;
- }
-
- const int slotsLeft = setSize - numBiggerProbs;
-
- top5Probs[tx] = slotsLeft <= 0.0f ? 0.0f : (numEqualsProbs <= slotsLeft ? 1.0f : float(slotsLeft) / numEqualsProbs);
- correctProbs[tx] = labelp != maxp ? 0.0f : 1.0f / float(numEqualsProbs);
- }
-}
-
-/*
- * E = -log(y_t)
- * probs: (numOut, numCases)
- * labels: (1, numCases)
- * maxProbs: (1, numCases)
- * labelLogProbs: (1, numCases) (*out)
- * correctProbs: (1, numCases) (*out)
- * top5Probs: (1, numCases) (*out)
- *
- * target: (1, numCases) == log(y_l[labels,:]
- */
-void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out,
- NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize) {
- int numCases = probs.getNumCols();
- int numOut = probs.getNumRows();
-
- assert(labels.getNumElements() == numCases);
- assert(!labels.isTrans());
- assert(!probs.isTrans());
- assert(labels.isContiguous());
- assert(probs.isContiguous());
-
-// NVMatrix& maxProbs = probs.max(0);
-
- labelLogProbs_out.resize(1, numCases);
- correctProbs_out.resize(1, numCases);
- top5Probs_out.resize(1, numCases);
- dim3 threads(LOGREG_ERR_THREADS_X, 1);
- dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
- cudaStream_t stream = NVMatrix::getDefaultStream();
-
- cudaFuncSetCacheConfig(kMultiSoftmaxCost, cudaFuncCachePreferL1);
- kMultiSoftmaxCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
- labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(),
- numCases, numOut, setSize);
-
- getLastCudaError("kMultiSoftmaxCost: Kernel execution failed");
-// cudaThreadSynchronize();
-}
-
-/*
- * E = sum(p_l * log(y_l))
- * probs: (numOut, numCases)
- * labels: (numOut, numCases)
- * maxProbs: (1, numCases)
- * labelLogProbs: (1, numCases) (*out)
- * correctProbs: (1, numCases) (*out)
- *
- * target: (1, numCases)
- */
-__global__ void kCrossEntCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
- const int numCases, const int numOut) {
- const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
-
- if (tx < numCases) {
- probs += tx;
- labels += tx;
- maxProbs += tx;
- labelLogProbs += tx;
- correctProbs += tx;
-
- const float maxp = maxProbs[0];
-
- /*
- * Compute the probability of guessing the correct case if you take the most-probable label.
- *
- * This is done like this:
- *
- * - If the most probable label is not equal to the true label, then the probability is zero.
- * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
- *
- * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
- * maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
- * Though it could never happen in reality. Well it could. But it wouldn't. Cool?
- */
- float crossEnt = 0.0f;
- int numMax = 0;
- bool correctLabel = false;
- for (int i = 0; i < numOut; i++) {
- const float label_prob = labels[i * numCases];
- const float model_prob = probs[i * numCases];
- numMax += model_prob == maxp;
- crossEnt += label_prob * safelog(model_prob);
- correctLabel |= model_prob == maxp && label_prob > 0.0f;
- }
- labelLogProbs[0] = crossEnt;
- if (!correctLabel) {
- correctProbs[0] = 0.0f;
- } else {
- correctProbs[0] = 1.0f / float(numMax);
- }
- }
-}
-
-/*
- * E = sum(p_l * log(y_l))
- * y_l: (numOut, numCases)
- * labels: (numOut, numCases)
- *
- * dE_dy_l: (numOut, numCases)
- */
-template <bool add>
-__global__ void kCrossEntGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
- const int numOut, const float gradCoeff) {
- const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
- const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
- const int tidx = ty * numCases + tx;
-
- if (ty < numOut && tx < numCases) {
- const float label_prob = labels[tidx];
- const float model_prob = y_l[tidx];
- const float v = gradCoeff * __fdividef(label_prob, model_prob);
- if (add) {
- dE_dy_l[tidx] += v;
- } else {
- dE_dy_l[tidx] = v;
- }
- }
-}
-
-/*
- * E = sum(p_l * log(y_l))
- * y_l: (numOut, numCases)
- * labels: (numOut, numCases)
- *
- * dE_dx_l: (numOut, numCases)
- */
-template <bool add>
-__global__ void kCrossEntSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
- const int numOut, const float gradCoeff) {
- const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
- const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
- const int tidx = ty * numCases + tx;
-
- if (ty < numOut && tx < numCases) {
- const float model_prob = y_l[tidx];
- const float label_prob = labels[tidx];
- float v = gradCoeff * (label_prob - model_prob);
- if (add) {
- dE_dx_l[tidx] += v;
- } else {
- dE_dx_l[tidx] = v;
- }
- }
-}
-
-/*
- * E = -log(y_t)
- * probs: (numOut, numCases)
- * labels: (1, numCases)
- * maxProbs: (1, numCases)
- * labelLogProbs: (1, numCases) (*out)
- * correctProbs: (1, numCases) (*out)
- *
- * target: (1, numCases)
- */
-__global__ void kLogregCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
- const int numCases, const int numOut) {
- const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
-
- if (tx < numCases) {
- const int label = int(labels[tx]);
- const float maxp = maxProbs[tx];
- const float labelp = probs[label * numCases + tx];
-
- labelLogProbs[tx] = __logf(labelp);
-
- /*
- * Compute the probability of guessing the correct case if you take the most-probable label.
- *
- * This is done like this:
- *
- * - If the most probable label is not equal to the true label, then the probability is zero.
- * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
- *
- * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
- * maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
- * Though it could never happen in reality. Well it could. But it wouldn't. Cool?
- */
- if (labelp != maxp) {
- correctProbs[tx] = 0;
- } else {
- int numMax = 0;
- for (int i = 0; i < numOut; i++) {
- numMax += probs[i * numCases + tx] == maxp;
- }
- correctProbs[tx] = 1.0f / float(numMax);
- }
- }
-}
-
-/*
- * E = -log(y_t)
- * y_l: (numOut, numCases)
- * labels: (1, numCases)
- *
- * dE_dy_l: (numOut, numCases)
- */
-template <bool add>
-__global__ void kLogregCostGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
- const int numOut, const float gradCoeff) {
- const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
- const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
- const int tidx = ty * numCases + tx;
-
- if (ty < numOut && tx < numCases) {
- const int label = int(labels[tx]);
- float v = gradCoeff * (label == ty);
- v = __fdividef(v, y_l[tidx]);
- if (add) {
- dE_dy_l[tidx] += v;
- } else {
- dE_dy_l[tidx] = v;
- }
- }
-}
-
-/*
- * E = -log(y_t)
- * y_l: (numOut, numCases)
- * labels: (1, numCases)
- *
- * dE_dx_l: (numOut, numCases)
- */
-template <bool add>
-__global__ void kLogregSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
- const int numOut, const float gradCoeff) {
- const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
- const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
- const int tidx = ty * numCases + tx;
-
- if (ty < numOut && tx < numCases) {
- const int label = int(labels[tx]);
- float v = gradCoeff * ((label == ty) - y_l[tidx]);
- if (add) {
- dE_dx_l[tidx] += v;
- } else {
- dE_dx_l[tidx] = v;
- }
- }
-}
-
-/*
- * dE_dy_l: (numOut, numCases)
- * y_l: (numOut, numCases)
- *
- * dE_dx_l: (numOut, numCases)
- */
-template <bool add>
-__global__ void kSoftmaxGrad(float* dE_dy_l, float* y_l, float* dE_dx_l, const int numCases, const int numOut, const float scaleTarget, const float scaleGrad) {
- const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
- const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
- const int tidx = ty * numCases + tx;
-
- if (ty < numOut && tx < numCases) {
- float v = 0;
- for (int j = 0; j < numOut; j++) {
- v += dE_dy_l[j * numCases + tx] * ((j == ty) - y_l[j * numCases + tx]);
- }
- v *= y_l[tidx];
-
- if (add) {
- dE_dx_l[tidx] = scaleTarget * dE_dx_l[tidx] + scaleGrad * v;
- } else {
- dE_dx_l[tidx] = scaleGrad * v;
- }
- }
-}
-
-template <int B_X, bool add>
-__global__ void kEltwiseMaxGrad(float* actGrad, float* input, float* output, float* target,
- const int numElements) {
- for (int i = B_X * blockIdx.x + threadIdx.x; i < numElements; i += B_X * gridDim.x) {
- if (add) {
- target[i] += actGrad[i] * (output[i] == input[i]);
- } else {
- target[i] = actGrad[i] * (output[i] == input[i]);
- }
- }
-}
-
-void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add) {
- assert(actGrad.isContiguous());
- assert(output.isContiguous());
- assert(input.isContiguous());
- assert(actGrad.isSameDims(input));
- assert(actGrad.isSameDims(output));
-
- dim3 blocks(DIVUP(actGrad.getNumElements(), 128));
- dim3 threads(128);
- cudaStream_t stream = NVMatrix::getDefaultStream();
- if (add) {
- assert(actGrad.isSameDims(target));
- cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, true>, cudaFuncCachePreferL1);
- kEltwiseMaxGrad<128, true><<<blocks, threads, 0, stream>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
- } else {
- target.resize(actGrad);
- cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, false>, cudaFuncCachePreferL1);
- kEltwiseMaxGrad<128, false><<<blocks, threads, 0, stream>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
- }
-
- getLastCudaError("computeEltwiseMaxGrad: Kernel execution failed");
-}
-
-/*
- * E = sum_i{-p_i*log(y_i)}
- * probs: (numOut, numCases)
- * labels: (numOut, numCases)
- * maxProbs: (1, numCases)
- * labelLogProbs: (1, numCases) (*out)
- * correctProbs: (1, numCases) (*out)
- *
- * target: (1, numCases)
- */
// Computes the cross-entropy cost E = sum_i{-p_i*log(y_i)} per case (see the
// shape comment above). labelLogProbs_out and correctProbs_out are resized to
// (1, numCases) and filled by the kernel.
void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.isSameDims(probs));
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

    // Column-wise max of probs; max() returns a heap-allocated matrix that we
    // own and must delete below.
    NVMatrix& maxProbs = probs.max(0);

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    // One thread per case along x.
    dim3 threads(LOGREG_ERR_THREADS_X, 1);
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    cudaFuncSetCacheConfig(kCrossEntCost, cudaFuncCachePreferL1);
    kCrossEntCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                                  labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
                                                  numCases, numOut);
    getLastCudaError("kCrossEntCost: Kernel execution failed");

    delete &maxProbs;
}
-
// Gradient of the cross-entropy cost w.r.t. the probabilities, scaled by
// coeff. When add is true the result is accumulated into target (which must
// already be correctly sized); otherwise target is resized and overwritten.
void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.isSameDims(probs));
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(!labels.isTrans());
    assert(!probs.isTrans());

    // 2-D launch: x over cases, y over outputs.
    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    // NOTE(review): unlike computeCrossEntSoftmaxGrad, no cudaFuncSetCacheConfig
    // call here — possibly an oversight, but it only affects performance.
    if (!add) {
        target.resize(probs);
        kCrossEntGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                     numCases, numOut, coeff);
    } else {
        kCrossEntGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                     numCases, numOut, coeff);
    }

    getLastCudaError("kCrossEntGrad: Kernel execution failed");
}
-
// Backpropagates a gradient through a softmax layer:
// target = scaleTarget * target + scaleGrad * dE/dx, given the softmax
// outputs (acts) and the gradient w.r.t. them (actsGrad). A scaleTarget of 0
// means "overwrite", which also allows target to be (re)sized here.
void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, float scaleTarget, float scaleGrad) {
    int numCases = acts.getLeadingDim();
    int numOut = acts.getFollowingDim();

    assert(acts.isSameDims(actsGrad));
    assert(acts.isContiguous());
    assert(actsGrad.isContiguous());
    assert(target.isContiguous());
    // Softmax activations are stored transposed, unlike the logreg inputs.
    assert(acts.isTrans());
    assert(actsGrad.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();

    if (scaleTarget == 0) {
        target.resize(acts);
        kSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad);
    } else {
        kSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut, scaleTarget, scaleGrad);
    }
    getLastCudaError("computeSoftmaxGrad: Kernel execution failed");
}
-
// Fused gradient of cross-entropy-through-softmax w.r.t. the softmax inputs,
// scaled by coeff. probs is transposed, labels is not (hence the explicit
// dimension comparison instead of isSameDims). add accumulates into target;
// otherwise target is resized and overwritten.
void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.getLeadingDim() == probs.getLeadingDim() && labels.getFollowingDim() == probs.getFollowingDim());
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(probs.isTrans());
    assert(!labels.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<false>, cudaFuncCachePreferL1);
        kCrossEntSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                                    numCases, numOut, coeff);
    } else {
        cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<true>, cudaFuncCachePreferL1);
        kCrossEntSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                                   numCases, numOut, coeff);
    }
    getLastCudaError("kCrossEntSoftmaxGrad: Kernel execution failed");
}
-
-/*
- * E = -log(y_t)
- * probs: (numOut, numCases)
- * labels: (1, numCases)
- * maxProbs: (1, numCases)
- * labelLogProbs: (1, numCases) (*out)
- * correctProbs: (1, numCases) (*out)
- *
- * target: (1, numCases) == log(y[labels,:])
- */
// Computes the logistic-regression cost E = -log(y_t) per case (see the shape
// comment above). labels holds one class index per case (hence numElements ==
// numCases); maxProbs is supplied by the caller here, unlike computeCrossEntCost.
void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& maxProbs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.getNumElements() == numCases);
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    // One thread per case along x.
    dim3 threads(LOGREG_ERR_THREADS_X, 1);
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaStream_t stream = NVMatrix::getDefaultStream();
    cudaFuncSetCacheConfig(kLogregCost, cudaFuncCachePreferL1);
    kLogregCost<<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                                labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
                                                numCases, numOut);
    getLastCudaError("computeLogregCost: Kernel execution failed");
}
-
// Gradient of the logreg cost w.r.t. the probabilities, scaled by coeff.
// labels holds one class index per case. add accumulates into target;
// otherwise target is resized and overwritten.
void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.getNumElements() == numCases);
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(!labels.isTrans());
    assert(!probs.isTrans());

    // 2-D launch: x over cases, y over outputs.
    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        kLogregCostGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                     numCases, numOut, coeff);
    } else {
        kLogregCostGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                     numCases, numOut, coeff);
    }

    getLastCudaError("computeLogregGrad: Kernel execution failed");
}
-
// Fused gradient of logreg-cost-through-softmax w.r.t. the softmax inputs,
// scaled by coeff. probs is transposed; labels holds one class index per case.
// add accumulates into target; otherwise target is resized and overwritten.
void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.getNumElements() == numCases);
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(probs.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    cudaStream_t stream = NVMatrix::getDefaultStream();
    if (!add) {
        target.resize(probs);
        kLogregSoftmaxGrad<false><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                     numCases, numOut, coeff);
    } else {
        kLogregSoftmaxGrad<true><<<blocks, threads, 0, stream>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                     numCases, numOut, coeff);
    }

    getLastCudaError("computeLogregSoftmaxGrad: Kernel execution failed");
}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <string>
-#include "../include/lr.cuh"
-#include "../include/util.cuh"
-
-/*
- * ==================================
- * ParameterSchedule
- * ==================================
- */
-ParameterSchedule& ParameterSchedule::make(PyObject* schedDict) {
- std::string type = pyDictGetString(schedDict, "type");
- PyObject* paramsDict = PyDict_GetItemString(schedDict, "params");
- double base = pyDictGetFloat(paramsDict, "base");
- if (type == "const") {
- return *new ParameterSchedule(base);
- } else {
- double tgtFactor = pyDictGetFloat(paramsDict, "tgtFactor");
- if (type == "linear") {
- return *new LinearParameterSchedule(base, tgtFactor);
- } else if (type == "exp") {
- return *new ExpParameterSchedule(base, tgtFactor);
- } else if (type == "dexp") {
- double numSteps = pyDictGetInt(paramsDict, "numSteps");
- return *new DiscreteExpParameterSchedule(base, tgtFactor, numSteps);
- }
- }
- throw std::string("Unknown learning rate schedule type ") + type;
-}
-
// Base schedule: a constant value equal to baseRate.
ParameterSchedule::ParameterSchedule(double baseRate)
    : _baseRate(baseRate) {
}

// Constant schedule ignores training progress.
double ParameterSchedule::getValue(double progress) {
    return _baseRate;
}

double ParameterSchedule::getBaseValue() const {
    return _baseRate;
}

// Virtual-ness (or not) of this destructor is declared in lr.cuh.
ParameterSchedule::~ParameterSchedule() {
}
-
-/*
- * ==================================
- * LinearParameterSchedule
- * ==================================
- */
// Linearly interpolates from baseRate (progress 0) down to baseRate/tgtFactor
// (progress 1).
LinearParameterSchedule::LinearParameterSchedule(double baseRate, double tgtFactor)
: ParameterSchedule(baseRate) {
    _finalRate = baseRate / tgtFactor;
}

double LinearParameterSchedule::getValue(double progress) {
    return _baseRate * (1 - progress) + _finalRate * progress;
}
-
-/*
- * ==================================
- * ExpParameterSchedule
- * ==================================
- */
// Exponential decay from baseRate (progress 0) to baseRate/tgtFactor
// (progress 1): value = baseRate * (1/tgtFactor)^progress.
ExpParameterSchedule::ExpParameterSchedule(double baseRate, double tgtFactor)
: ParameterSchedule(baseRate) {
    _powBase = 1.0 / tgtFactor;
}

double ExpParameterSchedule::getValue(double progress) {
    return _baseRate * std::pow(_powBase, progress);
}
-
-/*
- * ==================================
- * DiscreteExpParameterSchedule
- * ==================================
- */
// Precomputes numSteps rates sampled from the continuous exponential schedule,
// so getValue() returns a piecewise-constant (staircase) decay. The last step
// is pinned exactly to baseRate/tgtFactor.
DiscreteExpParameterSchedule::DiscreteExpParameterSchedule(double baseRate, double tgtFactor, int numSteps)
: ParameterSchedule(baseRate) {
    ExpParameterSchedule elrs(baseRate, tgtFactor);
    double finalRate = baseRate / tgtFactor;
    for (int i = 0; i < numSteps - 1; i++) {
        double progress = double(i) / (numSteps - 1);
        _rates.push_back(elrs.getValue(progress));
    }
    _rates.push_back(finalRate);
    //printf("initialized base %e, final %e, steps %d\n", baseRate, finalRate, numSteps);
}
-
-double DiscreteExpParameterSchedule::getValue(double progress) {
- for (int i = 0; i < _rates.size(); ++i) {
- if (progress <= double(i + 1) / _rates.size()) {
- return _rates[i];
- }
- }
- return _rates.back();
-}
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/memorysource.cuh"
-
-using namespace std;
-
-/*
- * =======================
- * MemoryView
- * =======================
- */
// A named handle onto a region of a shared MemorySource buffer.
MemoryView::MemoryView(MemorySource& src, std::string& name) : _src(&src), _name(name) {
}

MemoryView::~MemoryView() {
// NOTE(review): source cleanup is intentionally disabled here; the MemorySource
// outlives its views (see MemorySource::~MemorySource).
//    if (_src->truncate(_name)) {
//        delete _src;
//    }
}

// Memory for this view sized to numCases cases; may reallocate the source.
NVMatrix& MemoryView::getMemory(int numCases) {
    return _src->getMemory(_name, numCases);
}

// Memory for this view at the source's current size.
NVMatrix& MemoryView::getMemory() {
    return _src->getMemory(_name);
}

MemorySource& MemoryView::getMemorySource() {
    return *_src;
}

// True when this view spans the entire underlying buffer.
bool MemoryView::isParent() {
    return _src->getRange(_name).first == 0 && _src->getRange(_name).second == _src->getSize();
}

std::string& MemoryView::getName() {
    return _name;
}

// Registers a second name for the same row range on the shared source.
MemoryView& MemoryView::clone(std::string& name) {
    return _src->addUser(name, _src->getRange(_name));
}
-
-/*
- * =======================
- * MemorySource
- * =======================
- */
// A device-resident buffer of `size` rows shared by multiple named views.
MemorySource::MemorySource(int size, int deviceID) : _size(size), _deviceID(deviceID) {
}

MemorySource::~MemorySource() {
    // Each MemoryView is deleted by owner Layer, and the last one deletes the MemorySource.
    // So this is a no-op.
}

// Memory for `name` at the buffer's current leading dimension.
NVMatrix& MemorySource::getMemory(std::string& name) {
    return getMemory(name, _memory.getLeadingDim());
}
-
// Returns the row slice registered under `name`, first resizing the backing
// matrix to `numCases` columns if needed. A resize invalidates every existing
// slice view, so all views are deleted and recreated lazily. Guarded by _lock.
NVMatrix& MemorySource::getMemory(std::string& name, int numCases) {
    // numCases < 0 means "keep the current width".
    numCases = numCases < 0 ? _memory.getLeadingDim() : numCases;
    _lock.acquire();
    if (_memory.getLeadingDim() != numCases || _memory.getFollowingDim() != _size) {
        // Allocate on this source's device, then restore the caller's device.
        int d = NVMatrix::getDeviceID();
        NVMatrix::setDeviceID(_deviceID);
        _memory.resize(_size, numCases, false);
        for (map<std::string,NVMatrix*>::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) {
            delete it->second;
        }
        _memoryViews.clear();
        if (d >= 0) {
            NVMatrix::setDeviceID(d);
        }
    }
    if (_memoryViews.count(name) == 0) {
        assert(!_memory.isTrans());
        _memoryViews[name] = &_memory.sliceRows(_viewRanges[name].first, _viewRanges[name].second);
    }
    NVMatrix& view = *_memoryViews[name];
    assert(view.isContiguous());
    _lock.release();
    return view;
}
-
// Registers a new named view over the given row range and returns a
// heap-allocated handle to it (owned by the caller).
MemoryView& MemorySource::addUser(std::string& name, std::pair<int,int> range) {
    assert(_viewRanges.count(name) == 0);
    _viewRanges[name] = range;
    return *new MemoryView(*this, name);
}

// Convenience: a view over the whole buffer.
MemoryView& MemorySource::addUser(std::string& name) {
    return addUser(name, std::pair<int,int>(0, _size));
}

// Creates a source plus its initial whole-buffer view in one step.
MemoryView& MemorySource::make(int size, int deviceID, std::string& parentUser) {
    return (new MemorySource(size, deviceID))->addUser(parentUser);
}

pair<int,int> MemorySource::getRange(std::string& name) {
    return _viewRanges[name];
}

int MemorySource::getSize() {
    return _size;
}
-
// Records a truncation request from the view `name`; once every registered
// view has requested it, frees all slice views and the backing memory.
// Returns true only on the call that actually truncated. Guarded by _lock.
bool MemorySource::truncate(std::string& name) {
    bool truncated = false;
    _lock.acquire();
    _truncateRequests.insert(name);
    if (_truncateRequests.size() == _viewRanges.size()) {
        for (map<std::string,NVMatrix*>::const_iterator it = _memoryViews.begin(); it != _memoryViews.end(); ++it) {
            delete it->second;
        }
        _memoryViews.clear();
        _memory.truncate();
        _truncateRequests.clear();
        truncated = true;
    }
    _lock.release();
    return truncated;
}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/neuron.cuh"
-#include "../include/util.cuh"
-
-using namespace std;
-
-Neuron& Neuron::makeNeuron(PyObject* neuronDict) {
- std::string type = pyDictGetString(neuronDict, "type");
- PyObject* neuronParamsDict = PyDict_GetItemString(neuronDict, "params");
-
- if (type == "relu") {
- return *new ReluNeuron();
- }
-
- if (type == "drelu") {
- return *new DoubleReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
- }
-
- if (type == "softrelu") {
- return *new SoftReluNeuron();
- }
-
- if (type == "brelu") {
- return *new BoundedReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
- }
-
- if (type == "abs") {
- return *new AbsNeuron();
- }
-
- if (type == "logistic") {
- return *new LogisticNeuron();
- }
-
- if (type == "tanh") {
- return *new TanhNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
- }
-
- if (type == "square") {
- return *new SquareNeuron();
- }
-
- if (type == "sqrt") {
- return *new SqrtNeuron();
- }
-
- if (type == "linear") {
- return *new LinearNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
- }
-
- if (type == "log") {
- return *new LogNeuron(pyDictGetFloat(neuronParamsDict, "a"));
- }
-
- if (type == "ident") {
- return *new Neuron();
- }
-
- throw std::string("Unknown neuron type: ") + type;
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
#include <Python.h>
#include <arrayobject.h>
#include <assert.h>
#include <helper_cuda.h>
#include <cublas.h>
#include <time.h>
#include <vector>
#include <execinfo.h>
#include <signal.h>
#include <unistd.h>
-
-#include "../../util/include/matrix.h"
-#include "../../util/include/queue.h"
-#include "../include/worker.cuh"
-#include "../include/util.cuh"
-#include "../include/cost.cuh"
-
-#include "../include/pyconvnet.cuh"
-#include "../include/convnet.cuh"
-
-#include "../include/jpeg.h"
-
-using namespace std;
// The single model instance this extension module drives.
static ConvNet* model = NULL;

// Method table exported to Python as the _ConvNet module.
static PyMethodDef _ConvNetMethods[] = {{ "initModel",          initModel,              METH_VARARGS },
                                        { "startBatch",         startBatch,             METH_VARARGS },
                                        { "finishBatch",        finishBatch,            METH_VARARGS },
                                        { "checkGradients",     checkGradients,         METH_VARARGS },
                                        { "startMultiviewTest", startMultiviewTest,     METH_VARARGS },
                                        { "startFeatureWriter", startFeatureWriter,     METH_VARARGS },
                                        { "startDataGrad",      startDataGrad,          METH_VARARGS },
                                        { "syncWithHost",       syncWithHost,           METH_VARARGS },
                                        { "decodeJpeg",         decodeJpeg,             METH_VARARGS },
                                        { NULL, NULL }
};

// Python 2 module-init hook; import_array() sets up the numpy C API.
void init_ConvNet() {
    (void) Py_InitModule("_ConvNet", _ConvNetMethods);
    import_array();
}
-
/*
 * Fatal-signal handler: dumps a backtrace to stderr and terminates.
 * backtrace_symbols_fd writes directly to the fd without allocating.
 * (fprintf is technically not async-signal-safe either, but this is
 * best-effort crash reporting on the way down.)
 */
void signalHandler(int sig) {
    const size_t max_trace_size = 40;
    void *array[max_trace_size];
    size_t trace_size = backtrace(array, max_trace_size);
    fprintf(stderr, "Error signal %d:\n", sig);
    backtrace_symbols_fd(array, trace_size, STDERR_FILENO);
    // Fix vs. original: exit() is not async-signal-safe — it runs atexit
    // handlers and flushes stdio, which can deadlock inside a signal handler.
    _exit(1);
}
-
// Python entry point: builds the ConvNet from a layer-parameter dict, a list
// of device IDs, a minibatch size, and a conserve-memory flag, then starts
// its worker thread. May only be called once per process (model must be NULL).
PyObject* initModel(PyObject *self, PyObject *args) {
    assert(model == NULL);
    // Install crash reporting before any GPU work happens.
    signal(SIGSEGV, signalHandler);
    signal(SIGABRT, signalHandler);

    PyDictObject* pyLayerParams;
    PyListObject* pyDeviceIDs;
    int pyMinibatchSize;
    int conserveMem;

    if (!PyArg_ParseTuple(args, "O!O!ii",
                          &PyDict_Type, &pyLayerParams,
                          &PyList_Type, &pyDeviceIDs,
                          &pyMinibatchSize,
                          &conserveMem)) {
        return NULL;
    }
    // NOTE(review): getIntV heap-allocates; the vector appears to live for the
    // process lifetime along with `model` — confirm ConvNet takes ownership.
    intv& deviceIDs = *getIntV((PyObject*)pyDeviceIDs);

    model = new ConvNet((PyObject*)pyLayerParams,
                        deviceIDs,
                        pyMinibatchSize,
                        conserveMem);

    model->start();
    return Py_BuildValue("i", 0);
}
-
-/*
- * Starts training/testing on the given batch (asynchronous -- returns immediately).
- */
// Queues a training (or, with test=1, testing) pass over one batch.
// Asynchronous: returns immediately; pair with finishBatch() to collect costs.
PyObject* startBatch(PyObject *self, PyObject *args) {
    assert(model != NULL);
//    printf("starting next batch\n");
    PyListObject* data;
    double progress;          // training progress in [0,1], drives LR schedules
    int test = 0;             // optional third argument
    if (!PyArg_ParseTuple(args, "O!d|i",
        &PyList_Type, &data,
        &progress,
        &test)) {
        return NULL;
    }
    // The worker takes ownership of cpuData and deletes it when done.
    CPUData* cpuData = new CPUData((PyObject*)data);

    TrainingWorker* wr = new TrainingWorker(*model, *cpuData, progress, test);
    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}
-
-/*
- * Starts testing on the given batch (asynchronous -- returns immediately).
- */
// Queues a multi-view test pass (numViews crops per image). Optionally writes
// per-view probabilities into pyProbs for the named logreg layer.
// Asynchronous: returns immediately.
PyObject* startMultiviewTest(PyObject *self, PyObject *args) {
    assert(model != NULL);
    PyListObject* data;
    int numViews;
    PyArrayObject* pyProbs = NULL;   // optional
    char* logregName = NULL;         // optional, paired with pyProbs
    if (!PyArg_ParseTuple(args, "O!i|O!s",
        &PyList_Type, &data,
        &numViews,
        &PyArray_Type, &pyProbs,
        &logregName)) {
        return NULL;
    }
    // The worker takes ownership of cpuData (and the Matrix wrapper, if any).
    CPUData* cpuData = new CPUData((PyObject*)data);
    MultiviewTestWorker* wr = pyProbs == NULL ? new MultiviewTestWorker(*model, *cpuData, numViews)
                                              : new MultiviewTestWorker(*model, *cpuData, numViews, *new Matrix(pyProbs), logregName);
    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}
-
// Queues a forward pass that writes the activations of the named layers into
// the supplied output matrices. Asynchronous: returns immediately.
PyObject* startFeatureWriter(PyObject *self, PyObject *args) {
    assert(model != NULL);
    PyListObject* data;
    PyListObject* pyFtrs;        // one output matrix per requested layer
    PyListObject* pyLayerNames;  // names of the layers to capture
    if (!PyArg_ParseTuple(args, "O!O!O!",
        &PyList_Type, &data,
        &PyList_Type, &pyFtrs,
        &PyList_Type, &pyLayerNames)) {
        return NULL;
    }
    stringv* layerNames = getStringV((PyObject*)pyLayerNames);
    // The worker takes ownership of cpuData, ftrs and layerNames.
    CPUData* cpuData = new CPUData((PyObject*)data);
    MatrixV* ftrs = getMatrixV((PyObject*)pyFtrs);

    FeatureWorker* wr = new FeatureWorker(*model, *cpuData, *ftrs, *layerNames);
    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}
-
// Stub: the data-gradient worker is currently disabled. The commented-out
// body shows the intended wiring through DataGradWorker; for now this always
// reports success without doing any work.
PyObject* startDataGrad(PyObject *self, PyObject *args) {
//    assert(model != NULL);
//    PyListObject* data;
//    int dataLayerIdx, softmaxLayerIdx;
//    if (!PyArg_ParseTuple(args, "O!ii",
//        &PyList_Type, &data,
//        &dataLayerIdx, &softmaxLayerIdx)) {
//        return NULL;
//    }
//    CPUData* cpuData = new CPUData((PyObject*)data);
//    Matrix& ftrs = *mvec.back();
//    mvec.pop_back();
//
//    DataGradWorker* wr = new DataGradWorker(*model, *cpuData, ftrs, dataLayerIdx, softmaxLayerIdx);
//    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}
-
-/*
- * Waits for the trainer to finish training on the batch given to startBatch.
- * This is a blocking call so lets release the GIL.
- */
-PyObject* finishBatch(PyObject *self, PyObject *args) {
- assert(model != NULL);
- WorkResult* res = model->getResultQueue().dequeue();
- assert(res != NULL);
- assert(res->getResultType() == WorkResult::BATCH_DONE);
-
- Cost& cost = res->getResults();
- PyObject* dict = PyDict_New();
- CostMap& costMap = cost.getCostMap();
- for (CostMap::const_iterator it = costMap.begin(); it != costMap.end(); ++it) {
- PyObject* v = PyList_New(0);
- for (vector<double>::const_iterator iv = it->second->begin(); iv != it->second->end(); ++iv) {
- PyObject* f = PyFloat_FromDouble(*iv);
- PyList_Append(v, f);
- }
- PyDict_SetItemString(dict, it->first.c_str(), v);
- }
- PyObject* retVal = Py_BuildValue("Ni", dict, cost.getNumCases());
- delete res; // Deletes cost too
-
- return retVal;
-}
-
// Runs a numerical gradient check on one batch. Unlike startBatch, this is
// synchronous: it enqueues the worker and blocks for its result.
PyObject* checkGradients(PyObject *self, PyObject *args) {
    assert(model != NULL);
    PyListObject* data;
    if (!PyArg_ParseTuple(args, "O!",
        &PyList_Type, &data)) {
        return NULL;
    }
    // The worker takes ownership of cpuData.
    CPUData* cpuData = new CPUData((PyObject*)data);

    GradCheckWorker* wr = new GradCheckWorker(*model, *cpuData);
    model->getWorkerQueue().enqueue(wr);
    WorkResult* res = model->getResultQueue().dequeue();
    assert(res != NULL);
    assert(res->getResultType() == WorkResult::BATCH_DONE);
    delete res;
    return Py_BuildValue("i", 0);
}
-
-/*
- * Copies weight matrices from GPU to system memory.
- */
// Copies weight matrices from GPU to system memory. Synchronous: blocks until
// the sync worker reports completion.
PyObject* syncWithHost(PyObject *self, PyObject *args) {
    assert(model != NULL);
    SyncWorker* wr = new SyncWorker(*model);
    model->getWorkerQueue().enqueue(wr);
    WorkResult* res = model->getResultQueue().dequeue();
    assert(res != NULL);
    assert(res->getResultType() == WorkResult::SYNC_DONE);

    delete res;
    return Py_BuildValue("i", 0);
}
-
// Decodes a list of JPEG byte strings into the numpy array pyTarget, splitting
// the images across NUM_JPEG_DECODER_THREADS worker threads and joining them
// before returning.
PyObject* decodeJpeg(PyObject *self, PyObject *args) {
    PyListObject* pyJpegStrings;
    PyArrayObject* pyTarget;
    int img_size, inner_size, test, multiview;
    if (!PyArg_ParseTuple(args, "O!O!iiii",
        &PyList_Type, &pyJpegStrings,
        &PyArray_Type, &pyTarget,
        &img_size,
        &inner_size,
        &test,
        &multiview)) {
        return NULL;
    }

    Thread* threads[NUM_JPEG_DECODER_THREADS];
    int num_imgs = PyList_GET_SIZE(pyJpegStrings);
    int num_imgs_per_thread = DIVUP(num_imgs, NUM_JPEG_DECODER_THREADS);
    Matrix& dstMatrix = *new Matrix(pyTarget);
    for (int t = 0; t < NUM_JPEG_DECODER_THREADS; ++t) {
        // NOTE(review): if num_imgs < NUM_JPEG_DECODER_THREADS, later threads
        // get start_img > end_img — presumably DecoderThread tolerates an
        // empty range; confirm.
        int start_img = t * num_imgs_per_thread;
        int end_img = min(num_imgs, (t+1) * num_imgs_per_thread);

        threads[t] = new DecoderThread((PyObject*)pyJpegStrings, dstMatrix, start_img, end_img, img_size, inner_size, test, multiview);
        threads[t]->start();
    }

    for (int t = 0; t < NUM_JPEG_DECODER_THREADS; ++t) {
        threads[t]->join();
        delete threads[t];
    }
    // dstMatrix must wrap (not own) pyTarget's storage, so deleting the
    // wrapper leaves the numpy array's data intact.
    assert(dstMatrix.isView());
    delete &dstMatrix;
    return Py_BuildValue("i", 0);
}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vector>
-#include <map>
-#include "../include/reducepipeline.cuh"
-
-using namespace std;
-
-/* =========================
- * IReduceSegment
- * =========================
- */
-// Null mat --> reducer on host
// One stage of the reduction pipeline, running on its own thread pinned to
// the CPUs nearest the reducer's target device. finishQueue is non-NULL only
// for the terminal segment, which signals completion through it.
// NOTE(review): the base class Thread is listed last in the initializer list
// but is constructed first regardless (-Wreorder); harmless here.
IReduceSegment::IReduceSegment(IEightGPUReducer& parent, int deviceID, Queue<int>* finishQueue)
: _deviceID(deviceID), _next(NULL), _finishQueue(finishQueue), Thread(true, getDeviceCPUs(parent.getTgtDeviceID())) {
}

IReduceSegment::~IReduceSegment() {
}
-
// Returns chunk chunkIdx of mat viewed as a flat row vector; the final chunk
// may be shorter. The caller owns (and must delete) the returned slice; the
// temporary reshape is cleaned up here.
NVMatrix& IReduceSegment::getChunk(const NVMatrix& mat, int chunkSize, int chunkIdx) {
    NVMatrix& line = mat.reshaped(1, mat.getNumElements());
    int start = chunkIdx * chunkSize;
    int end = min((chunkIdx+1) * chunkSize, mat.getNumElements());
//    _mat->printShape("_mat");
    NVMatrix& chunk = line.sliceCols(start, end);
    delete &line;
//    chunk.printShape("chunk");
    return chunk;
}
-
// Thread main loop: processes queued messages until an EXIT message arrives.
// When a message completes the whole reduction (processMessage returns true),
// the terminal segment notifies the waiting reducer via _finishQueue.
void* IReduceSegment::run() {
    bool exit = false;
    while (!exit) {
        ReduceMessage& msg = *_queue.dequeue();
        if (msg.getType() == EXIT) {
            exit = true;
        } else {
            bool term = processMessage(msg);
            if (term) {
                assert(_finishQueue);
                _finishQueue->enqueue(1);
            }
        }
        // Messages are heap-allocated by the sender and freed here.
        delete &msg;
    }
    return NULL;
}
-
// This segment's operand: the message's matrix for our device.
inline NVMatrix& IReduceSegment::getMatrix(ReduceMessage& msg) {
    return msg.getMatrix(getDeviceID());
}

Queue<ReduceMessage*>& IReduceSegment::getQueue() {
    return _queue;
}

inline int IReduceSegment::getDeviceID() const {
    return _deviceID;
}

// Registers an upstream segment feeding into this one.
void IReduceSegment::addPrev(IReduceSegment& c) {
    _prev.push_back(&c);
}

// Links this segment to its single downstream peer (and back-links us).
void IReduceSegment::addNext(ReducePeer& c) {
    assert(_next == NULL);
    _next = &c;
    c.addPrev(*this);
}

// A terminal segment has no downstream peer; its output is the final result.
bool IReduceSegment::isTerminal() const {
    return _next == NULL;
}
-
-/* =========================
- * ReducerSource
- * =========================
- */
// Pipeline entry point: owns no reduction work itself, it just splits its
// matrix into chunks and streams them to the next peer.
ReducerSource::ReducerSource(IEightGPUReducer& parent, int deviceID) : IReduceSegment(parent, deviceID, NULL) {
}

bool ReducerSource::processMessage(ReduceMessage& msg) {
    assert(msg.getType() == REDUCE_START);
    // Chunk count is clamped to [REDUCE_MIN_CHUNKS, REDUCE_MAX_CHUNKS] and to
    // the element count, targeting at least REDUCE_MIN_CHUNK_SIZE per chunk.
    int numChunks = min(getMatrix(msg).getNumElements(), max(REDUCE_MIN_CHUNKS, min(REDUCE_MAX_CHUNKS, DIVUP(getMatrix(msg).getNumElements(), REDUCE_MIN_CHUNK_SIZE))));
    int chunkSize = DIVUP(getMatrix(msg).getNumElements(), numChunks);
    //printf("num chunks: %d\n", numChunks);
    // Deliberately <=: the extra message with chunkIdx == numChunks is the
    // end-of-stream sentinel that ReducePeer counts in _numInputsFinished.
    for (int c = 0; c <= numChunks; ++c) {
        _next->getQueue().enqueue(new ReduceChunkMessage(*this, c, chunkSize, numChunks, msg.getScaleIntermediates(), msg.getScaleTarget(), msg.getMatrices()));
    }
    return false;
}
-
-/* =========================
- * ReducePeer
- * =========================
- */
// Intermediate/terminal reduction stage on a GPU (or on the host when
// deviceID == DEVICE_HOST). _add controls whether this peer accumulates into
// its own buffer or overwrites it on the first input.
ReducePeer::ReducePeer(IEightGPUReducer& parent,int deviceID, Queue<int>* finishQueue) : IReduceSegment(parent, deviceID, finishQueue), _numInputsFinished(0) {
    _add = deviceID != DEVICE_HOST;
}

// Host-side pass-through peer (no accumulation, no finish queue).
ReducePeer::ReducePeer(IEightGPUReducer& parent) : IReduceSegment(parent, DEVICE_HOST, NULL), _numInputsFinished(0), _add(false) {
}

ReducePeer::~ReducePeer() {
    // Destroy every per-device stream created lazily by getStream().
    for(std::map<int,cudaStream_t>::iterator it = _streams.begin(); it != _streams.end(); ++it) {
        checkCudaErrors(cudaStreamDestroy(it->second));
    }
    _streams.clear();
}
-
// Lazily creates (and caches) a non-blocking stream for deviceID.
// Negative IDs (the host pseudo-device) map to the NULL stream.
inline cudaStream_t ReducePeer::getStream(int deviceID) {
    if (deviceID < 0) {
        return NULL;
    }
    if (_streams.count(deviceID) == 0) {
        NVMatrix::setDeviceID(deviceID);
        checkCudaErrors(cudaStreamCreateWithFlags(&_streams[deviceID], cudaStreamNonBlocking));
    }
    return _streams[deviceID];
}
-
// Consumes one chunk message from an upstream segment. For a real chunk
// (chunkIdx < numChunks) it accumulates the upstream chunk into this peer's
// buffer; a chunkIdx == numChunks message is the upstream's end-of-stream
// sentinel. Returns true only when this is the terminal peer and every
// upstream segment has sent its sentinel.
bool ReducePeer::processMessage(ReduceMessage& msg) {
    assert(msg.getType() == REDUCE_CHUNK);

    ReduceChunkMessage& cmsg = *static_cast<ReduceChunkMessage*>(&msg);
//    if (_numInputsReceived.count(cmsg.getChunkIdx()) == 0) {
//        _numInputsReceived[cmsg.getChunkIdx()] = 0;
//    }
    // operator[] default-initializes to 0, so this counts per-chunk arrivals.
    int& inputsRcvd = ++_numInputsReceived[cmsg.getChunkIdx()];
//    printf("reducer on device %d got msg chunk idx %d of %d, inputs rcvd for this chunk idx: %d/%d\n",
//            getDeviceID(), cmsg.getChunkIdx(), cmsg.getNumChunks(),_numInputsReceived[cmsg.getChunkIdx()], _prev.size());
    if (cmsg.getChunkIdx() < cmsg.getNumChunks()) {
        IReduceSegment& src = cmsg.getSource();
        // Intermediate scaling applies only at the terminal stage; the first
        // arrival for a chunk decides whether our buffer is kept (scaled) or
        // overwritten (scaleSelf == 0, which also permits the resize below).
        float scalePrev = isTerminal() ? cmsg.getScaleIntermediates() : 1;
        float scaleSelf = inputsRcvd == 1 ? _add * (isTerminal() ? cmsg.getScaleTarget() : 1): 1;
        if (scaleSelf == 0 || isTerminal()) {
            if (getDeviceID() >= 0) {
                NVMatrix::setDeviceID(getDeviceID());
            }
            getMatrix(msg).resize(src.getMatrix(msg));
        }
        assert(getMatrix(msg).isSameDims(src.getMatrix(msg)));
        // Chunk views are heap-allocated by getChunk and freed below.
        NVMatrix& prevChunk = getChunk(src.getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
        NVMatrix& myChunk = getChunk(getMatrix(msg), cmsg.getChunkSize(), cmsg.getChunkIdx());
        int execDeviceID = getDeviceID() >= 0 ? getDeviceID() : src.getDeviceID();
        if (execDeviceID >= 0) {
            NVMatrix::setDeviceID(execDeviceID);
            prevChunk.add(myChunk, scalePrev, scaleSelf, myChunk, getStream(execDeviceID));
            NVMatrix::syncStream(getStream(execDeviceID));
        } else {
            // Both sides on the host: reduce on the CPU.
            assert(!isTerminal());
            hostAdd(prevChunk.getDevData(), myChunk.getDevData(), prevChunk.getNumElements(), scaleSelf);
        }

        delete &prevChunk;
        delete &myChunk;

    } else {
        _numInputsFinished++;
    }
    // Once all upstreams have delivered this chunk, forward it downstream.
    if (!isTerminal() && inputsRcvd == _prev.size()) {
//        printf("    device %d enqueueing msg for next on device %d\n", getDeviceID(), _next->getDeviceID());
        _next->getQueue().enqueue(
                new ReduceChunkMessage(*this, cmsg.getChunkIdx(), cmsg.getChunkSize(), cmsg.getNumChunks(),
                        cmsg.getScaleIntermediates(), cmsg.getScaleTarget(), cmsg.getMatrices()));
    }

    // All upstream sentinels seen --> this reduction is done; reset state.
    bool finished = _numInputsFinished == _prev.size();
    if (finished) {
        _numInputsFinished = 0;
        _numInputsReceived.clear();
    }
    return finished && isTerminal();
}
-
-void ReducePeer::hostAdd(const float* src, float* tgt, const int n, const float scaleTgt) {
- if (scaleTgt != 0) {
- for (int i = 0; i < n; ++i) {
- tgt[i] = scaleTgt * tgt[i] + src[i];
- }
- } else {
- for (int i = 0; i < n; ++i) {
- tgt[i] = src[i];
- }
- }
-}
-
// GPU peers use the message's per-device matrix; the host peer uses its own
// scratch matrix since the message map is keyed by real device IDs.
inline NVMatrix& ReducePeer::getMatrix(ReduceMessage& msg) {
    if (getDeviceID() != DEVICE_HOST) {
        return IReduceSegment::getMatrix(msg);
    }
    return _mat;
}
-
-/* =========================
- * EightGPUReducer
- * =========================
- */
// Reduces matrices from 8 GPUs down to one target device.
IEightGPUReducer::IEightGPUReducer(int tgtDeviceID) : _tgtDeviceID(tgtDeviceID) {
}

IEightGPUReducer::~IEightGPUReducer() {
    // Shut down every pipeline thread (sources and peers) before freeing it.
    vector<IReduceSegment*> v;
    v.insert(v.end(), _sources.begin(), _sources.end());
    v.insert(v.end(), _peers.begin(), _peers.end());
    for (vector<IReduceSegment*>::iterator it = v.begin(); it != v.end(); ++it) {
        (*it)->getQueue().enqueue(new ReduceMessage(EXIT));
        (*it)->join();
        delete *it;
    }
}
-
// Partitions the other 7 devices into peer-accessible ("same") and
// non-peer-accessible ("other") groups, in randomized order (each ID is
// inserted at a random position), wires up the pipeline via the subclass's
// makeConnections, and starts every segment thread.
IEightGPUReducer& IEightGPUReducer::construct() {
    vector<int> same, other;
    for (int i = 0; i < 8; ++i) {
        if (i != _tgtDeviceID) {
            if (NVMatrix::canAccessPeer(_tgtDeviceID, i)) {
                same.insert(same.begin() + rand() % (1 + same.size()), i);
            } else {
                other.insert(other.begin() + rand() % (1 + other.size()), i);
            }
        }
    }
    // This reducer assumes the 4+4 topology of an 8-GPU machine.
    assert(same.size() == 3);
    assert(other.size() == 4);
    makeConnections(same, other);
    for (vector<ReducerSource*>::const_iterator it = _sources.begin(); it != _sources.end(); ++it) {
        (*it)->start();
    }
    for (vector<ReducePeer*>::const_iterator it = _peers.begin(); it != _peers.end(); ++it) {
        (*it)->start();
    }
    return *this;
}
-
// Reduces the 7 source matrices into mats[_tgtDeviceID] using the worker
// pipeline. Blocks until the terminal segment signals completion through
// _finishQueue. If every source is empty, just shapes the target accordingly.
void IEightGPUReducer::reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates, float scaleTarget) {
    assert(mats.size() == 8);
    // Check if source matrices are 0-sized
    bool zero = true;
    for (map<int,NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
        if (it->first != _tgtDeviceID && it->second->getNumElements() != 0) {
            zero = false;
            break;
        }
    }
    if (zero) {
        // Nothing to reduce; mirror the (empty) shape of any source matrix.
        mats[_tgtDeviceID]->resize(*mats[(_tgtDeviceID + 1) % 8]);
    } else {
        // Kick off every source segment; the pipeline threads do the rest.
        for (vector<ReducerSource*>::const_iterator it = _sources.begin(); it != _sources.end(); ++it) {
            (*it)->getQueue().enqueue(new ReduceStartMessage(scaleIntermediates, scaleTarget, mats));
        }
        _finishQueue.dequeue(); // wait for the terminal segment to finish
    }
    assert(_finishQueue.getNumElements() == 0);
}
-
-void IEightGPUReducer::reduce(std::map<int, NVMatrix*>& mats, float scaleIntermediates) {
- reduce(mats, scaleIntermediates, 1);
-}
-
-void IEightGPUReducer::reduce(std::map<int, NVMatrix*>& mats) {
- reduce(mats, 1, 1);
-}
-
-int IEightGPUReducer::getTgtDeviceID() const {
- return _tgtDeviceID;
-}
-
-/* =========================
- * EightGPUReducer1
- * =========================
- */
EightGPUReducer1::EightGPUReducer1(int tgtDeviceID) : IEightGPUReducer(tgtDeviceID) {
}

// Builds the reduction pipeline for topology variant 1. Peers constructed
// with a finish queue notify it on completion; peers constructed without a
// device ID are host-memory relay segments. The "same trunk" chain ends at
// peers[0] (the target device); the "other trunk" traffic is funneled through
// the host peers into the same terminal. NOTE(review): the exact wiring
// encodes assumptions about the machine's PCIe layout -- do not reorder.
void EightGPUReducer1::makeConnections(vector<int>& same, vector<int>&other) {
    // Setup segments on same truck
    _peers.push_back(new ReducePeer(*this, _tgtDeviceID, &_finishQueue)); // peers[0] = tgt
    _peers.push_back(new ReducePeer(*this,same[0], &_finishQueue));       // peers[1] = same truck 1
    _peers.push_back(new ReducePeer(*this,same[1], &_finishQueue));       // peers[2] = same truck 2
    _sources.push_back(new ReducerSource(*this,same[2]));                 // sources[0] = same truck 3

    _sources[0]->addNext(*_peers[2]);
    _peers[2]->addNext(*_peers[1]);
    _peers[1]->addNext(*_peers[0]);

    // Setup segments on other truck
    _sources.push_back(new ReducerSource(*this,other[0]));                // sources[1] = other truck 1
    _peers.push_back(new ReducePeer(*this,other[1], &_finishQueue));      // peers[3] = other truck 2
    _peers.push_back(new ReducePeer(*this,other[2], &_finishQueue));      // peers[4] = other truck 3
    _sources.push_back(new ReducerSource(*this,other[3]));                // sources[2] = other truck 4
    _peers.push_back(new ReducePeer(*this));                              // peers[5] = host 1
    _peers.push_back(new ReducePeer(*this));                              // peers[6] = host 2
    _peers.push_back(new ReducePeer(*this));                              // peers[7] = host 3

    _sources[1]->addNext(*_peers[3]);
    _peers[3]->addNext(*_peers[5]);
    _peers[5]->addNext(*_peers[7]);
    _peers[7]->addNext(*_peers[0]);
    _peers[4]->addNext(*_peers[6]);
    _peers[6]->addNext(*_peers[7]);
    _sources[2]->addNext(*_peers[4]);
}
-
-/* =========================
- * EightGPUReducer2
- * =========================
- */
EightGPUReducer2::EightGPUReducer2(int tgtDeviceID) : IEightGPUReducer(tgtDeviceID) {
}

// Topology variant 2: the "other trunk" devices form one serial chain that
// crosses to the target through a single host-memory peer (peers[6]).
// NOTE(review): wiring encodes PCIe-layout assumptions -- do not reorder.
void EightGPUReducer2::makeConnections(vector<int>& same, vector<int>&other) {
    // Setup segments on same truck
    _peers.push_back(new ReducePeer(*this,_tgtDeviceID, &_finishQueue)); // peers[0] = tgt
    _peers.push_back(new ReducePeer(*this,same[0], &_finishQueue));      // peers[1] = same truck 1
    _peers.push_back(new ReducePeer(*this,same[1], &_finishQueue));      // peers[2] = same truck 2
    _sources.push_back(new ReducerSource(*this,same[2]));                // sources[0] = same truck 3

    _sources[0]->addNext(*_peers[2]);
    _peers[2]->addNext(*_peers[1]);
    _peers[1]->addNext(*_peers[0]);

    // Setup segments on other truck
    _sources.push_back(new ReducerSource(*this,other[0]));               // sources[1] = other truck 1
    _peers.push_back(new ReducePeer(*this,other[1], &_finishQueue));     // peers[3] = other truck 2
    _peers.push_back(new ReducePeer(*this,other[2], &_finishQueue));     // peers[4] = other truck 3
    _peers.push_back(new ReducePeer(*this,other[3], &_finishQueue));     // peers[5] = other truck 4
    _peers.push_back(new ReducePeer(*this));                             // peers[6] = host 1

    _sources[1]->addNext(*_peers[3]);
    _peers[3]->addNext(*_peers[4]);
    _peers[4]->addNext(*_peers[5]);
    _peers[5]->addNext(*_peers[6]);
    _peers[6]->addNext(*_peers[0]);
}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/streambroadcast.cuh"
-
-using namespace std;
-
-/*
- * =====================
- * StreamBroadcast
- * =====================
- */
-
// Construct with externally owned streams; none are recorded in
// _ownedStreams, so the destructor will not destroy them.
StreamBroadcast::StreamBroadcast(map<int,cudaStream_t>& streams) {
    _streams = streams;
}

// Streams are created lazily per device in init().
StreamBroadcast::StreamBroadcast() {
}

// Async device -> host copy of src, on srcDevice's stream.
void StreamBroadcast::toHostMem(NVMatrix& src, NVMatrix& hostmem, int srcDevice) {
    src.copy(hostmem, _streams[srcDevice]);
}

// Accumulate host buffer into tgt on tgtDevice's stream
// (presumably tgt = scaleTarget * tgt + scaleOutput * hostmem; scale order
// follows NVMatrix::add -- confirm against its declaration).
void StreamBroadcast::toTarget(NVMatrix& hostmem, NVMatrix& tgt, int tgtDevice, float scaleTarget, float scaleOutput) {
    tgt.add(hostmem, scaleTarget, scaleOutput, tgt, _streams[tgtDevice]);
}
-
// Creates a non-blocking stream for any participating device that doesn't
// have one yet, and remembers which streams this object owns.
void StreamBroadcast::init(map<int, NVMatrix*>& mats) {
    for (map<int, NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
        if (_streams.count(it->first) == 0) {
            _ownedStreams.insert(it->first);
            NVMatrix::setDeviceID(it->first);
            checkCudaErrors(cudaStreamCreateWithFlags(&_streams[it->first], cudaStreamNonBlocking));
        }
    }
}

// Destroys only streams created by init(), never externally supplied ones.
StreamBroadcast::~StreamBroadcast() {
    for (set<int>::const_iterator it = _ownedStreams.begin(); it != _ownedStreams.end(); ++it) {
        checkCudaErrors(cudaStreamDestroy(_streams[*it]));
    }
}

cudaStream_t StreamBroadcast::getStream(int deviceID) {
    return _streams[deviceID];
}

// Sync stream associated with given device id
void StreamBroadcast::sync(int deviceID) {
    NVMatrix::syncStream(_streams[deviceID]);
}
-
-void StreamBroadcast::transfer(map<int,NVMatrix*>& mats, int srcDevice) {
- transfer(mats, _hostMem, srcDevice, 0, 1);
-}
-
-void StreamBroadcast::transfer(map<int,NVMatrix*>& mats, int srcDevice, float scaleTarget, float scaleOutput) {
- transfer(mats, _hostMem, srcDevice, scaleTarget, scaleOutput);
-}
-
/*
 * Broadcasts mats[srcDevice] into every other matrix in 'mats', accumulating
 * with the given scales on each target device. Data is staged through the
 * pinned host buffer 'hostbuf', except for the two-device peer-access case,
 * which adds directly. Large transfers are chunked so targets can consume
 * earlier chunks while later chunks are still being copied off the source.
 * All participating streams are synchronized before returning.
 */
void StreamBroadcast::transfer(map<int,NVMatrix*>& mats, HostNVMatrix& hostbuf, int srcDevice, float scaleTarget, float scaleOutput) {
    int oldDeviceID = NVMatrix::getDeviceID();
    assert(mats.count(srcDevice) != 0);
    init(mats); // lazily create one stream per participating device
//    assert(_streams.count(srcDevice) != 0);
    if (mats.size() > 1) {
        if (mats[srcDevice]->getNumElements() == 0) {
            // Empty source: just propagate the (empty) shape to every target.
            for (map<int,NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
                it->second->resize(*mats[srcDevice]);
            }
        } else {
            // Any device other than the source serves as the representative target.
            int tgtDevice = mats.begin()->first != srcDevice ? mats.begin()->first : (++mats.begin())->first;
            // This case is a simple copy
            if (mats.size() == 2 && NVMatrix::canAccessPeer(tgtDevice, srcDevice)) {
                NVMatrix::setDeviceID(tgtDevice);
                mats[tgtDevice]->add(*mats[srcDevice], scaleTarget, scaleOutput, *mats[tgtDevice], _streams[tgtDevice]);
            } else {
                NVMatrix& src = *mats[srcDevice];
                // Grow (never shrink) the staging buffer to fit the source.
                if (hostbuf.getNumElements() < src.getNumElements()) {
                    hostbuf.resize(1,src.getNumElements());
                }
                hostbuf.setTrans(src.isTrans());

                // View over exactly src's elements, shaped like src.
                NVMatrix& hostmat = hostbuf.sliceCols(0, src.getNumElements());
                assert(hostmat.isView());
                hostmat.reshape(src.getNumRows(), src.getNumCols());

                for (map<int,NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
                    assert(it->second->isContiguous());
                    NVMatrix::setDeviceID(it->first);
                    it->second->resize(src);
                    assert(it->second->isTrans() == src.isTrans());
                }
                int numChunks = min(DIVUP(src.getNumElements(), SB_MIN_CHUNK_SIZE), SB_MAX_CHUNKS);

                if (numChunks == 1) { // This is a bit faster for small matrices
                    // Single chunk: one device->host copy, then fan out.
                    NVMatrix::setDeviceID(srcDevice);
                    toHostMem(src, hostmat, srcDevice);
                    NVMatrix::syncStream(_streams[srcDevice]);

                    for (map<int,NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
                        if (it->first != src.getDataDeviceID()) {
                            NVMatrix::setDeviceID(it->first);
                            toTarget(hostmat, *it->second, it->first, scaleTarget, scaleOutput);
                        }
                    }
                } else {
                    int n = src.getNumElements();

                    // Work on flat 1 x n views so each chunk is a column slice.
                    map<int,NVMatrix*> lines;
                    for (map<int,NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
                        lines[it->first] = &it->second->reshaped(1, n);
                        lines[it->first]->setTrans(src.isTrans());
                    }
                    NVMatrix& srcLine = *lines[srcDevice];
                    hostmat.reshape(1, n);

                    int chunkSize = DIVUP(n, numChunks);
                    bool trans = src.isTrans();
                    for (int i = 0; i < numChunks; ++i) {
                        int start = i * chunkSize;
                        int end = min((i+1) * chunkSize, n);
                        if (start < end) {
                            NVMatrix& tmpSrc = srcLine.sliceCols(start, end); // view
                            NVMatrix& tmpHostmem = hostmat.sliceCols(start, end); // view

                            // The source stream is synced per chunk so targets
                            // never read a half-written host slice; target
                            // streams are only synced once, at the end.
                            NVMatrix::setDeviceID(srcDevice);
                            toHostMem(tmpSrc, tmpHostmem, srcDevice);
                            NVMatrix::syncStream(_streams[srcDevice]);

                            for (map<int,NVMatrix*>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
                                if (it->first != srcDevice) {
                                    NVMatrix& tmpTgt = it->second->sliceCols(start, end); // view
                                    NVMatrix::setDeviceID(it->first);
                                    toTarget(tmpHostmem, tmpTgt, it->first, scaleTarget, scaleOutput);
                                    delete &tmpTgt;
                                }
                            }
                            delete &tmpSrc;
                            delete &tmpHostmem;
                        }
                    }
                    // Free the reshaped line views.
                    for (map<int,NVMatrix*>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
                        delete it->second;
                    }
                }
                delete &hostmat;
            }
            // Don't return until every target has consumed the host buffer.
            for(map<int,NVMatrix*>::const_iterator it = mats.begin(); it != mats.end(); ++it) {
                if (it->first != srcDevice) {
                    NVMatrix::syncStream(_streams[it->first]);
                }
            }
        }
    }
    if (oldDeviceID >= 0) {
        NVMatrix::setDeviceID(oldDeviceID);
    }
}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <Python.h>
-#include <arrayobject.h>
-#include <helper_cuda.h>
-#include "../include/util.cuh"
-
-using namespace std;
-
-stringv* getStringV(PyObject* pyList) {
- if (pyList == NULL) {
- return NULL;
- }
- stringv* vec = new stringv();
- for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
- vec->push_back(std::string(PyString_AS_STRING(PyList_GET_ITEM(pyList, i))));
- }
- return vec;
-}
-
-floatv* getFloatV(PyObject* pyList) {
- if (pyList == NULL) {
- return NULL;
- }
- floatv* vec = new floatv();
- for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
- vec->push_back(PyFloat_AS_DOUBLE(PyList_GET_ITEM(pyList, i)));
- }
- return vec;
-}
-
-intv* getIntV(PyObject* pyList) {
- if (pyList == NULL) {
- return NULL;
- }
- intv* vec = new intv();
- for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
- vec->push_back(PyInt_AS_LONG(PyList_GET_ITEM(pyList, i)));
- }
- return vec;
-}
-
-int* getIntA(PyObject* pyList) {
- if (pyList == NULL) {
- return NULL;
- }
- int* arr = new int[PyList_GET_SIZE(pyList)];
- for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
- arr[i] = PyInt_AS_LONG(PyList_GET_ITEM(pyList, i));
- }
- return arr;
-}
-
-MatrixV* getMatrixV(PyObject* pyList) {
- return getMatrixV(pyList, PyList_GET_SIZE(pyList));
-}
-
-MatrixV* getMatrixV(PyObject* pyList, int len) {
- if (pyList == NULL) {
- return NULL;
- }
- MatrixV* vec = new MatrixV();
- for (int i = 0; i < len; i++) {
- vec->push_back(new Matrix((PyArrayObject*)PyList_GET_ITEM(pyList, i)));
- }
- return vec;
-}
-
// Returns the dict's values as a new vector of borrowed PyObject pointers.
// The temporary values list is released before returning; the dict itself
// keeps the values alive, so the borrowed pointers remain valid as long as
// the dict does.
PyObjectV* pyDictGetValues(PyObject* dict) {
    PyObjectV* pov = new PyObjectV();
    PyObject* valuesList = PyDict_Values(dict);
    int numValues = PyList_GET_SIZE(valuesList);

    for (int i = 0; i < numValues; i++) {
        pov->push_back(PyList_GET_ITEM(valuesList, i));
    }
    Py_DECREF(valuesList);
    return pov;
}
-
// The pyDictGet* helpers fetch dict[key] and convert it to a C++ value.
// NOTE(review): PyDict_GetItemString returns a borrowed reference, and these
// helpers assume the key is present -- a missing key passes NULL straight to
// the PyXXX_AS_* macros. Confirm callers always supply existing keys (use
// pyDictHasKey first for optional keys).
int pyDictGetInt(PyObject* dict, const char* key) {
    return PyInt_AS_LONG(PyDict_GetItemString(dict, key));
}

intv* pyDictGetIntV(PyObject* dict, const char* key) {
    return getIntV(PyDict_GetItemString(dict, key));
}

int* pyDictGetIntA(PyObject* dict, const char* key) {
    return getIntA(PyDict_GetItemString(dict, key));
}

std::string pyDictGetString(PyObject* dict, const char* key) {
    return std::string(PyString_AS_STRING(PyDict_GetItemString(dict, key)));
}

float pyDictGetFloat(PyObject* dict, const char* key) {
    return PyFloat_AS_DOUBLE(PyDict_GetItemString(dict, key));
}

floatv* pyDictGetFloatV(PyObject* dict, const char* key) {
    return getFloatV(PyDict_GetItemString(dict, key));
}

// Caller owns the returned Matrix wrapper.
Matrix* pyDictGetMatrix(PyObject* dict, const char* key) {
    return new Matrix((PyArrayObject*)PyDict_GetItemString(dict, key));
}

MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key) {
    return getMatrixV(PyDict_GetItemString(dict, key));
}

stringv* pyDictGetStringV(PyObject* dict, const char* key) {
    return getStringV(PyDict_GetItemString(dict, key));
}
-
-bool pyDictHasKey(PyObject* dict, const char* key) {
- PyObject* str = PyString_FromString(key);
- bool b = PyDict_Contains(dict, str);
- Py_DECREF(str);
- return b;
-}
-
// Randomizes v[start, end) in place by performing 5*len random pairwise
// swaps using rand(). No-op on an empty range (the loop bound is 0).
template<typename T>
void shuffleVector(std::vector<T>& v, int start, int end) {
    const int len = end - start;
    for (int i = 0; i < len*5; ++i) {
        int r1 = start + rand() % len;
        int r2 = start + rand() % len;
        // BUGFIX: the swap temporary must have the element type T. The
        // original declared it `int`, silently truncating elements for any
        // non-integral T (e.g. shuffling a vector<double> destroyed values).
        T tmp = v[r1];
        v[r1] = v[r2];
        v[r2] = tmp;
    }
}
-
// Renders any streamable value as a string via an output string stream.
template<class T>
std::string tostr(T n) {
    std::ostringstream oss;
    oss << n;
    return oss.str();
}
-
// Deletes every pointed-to element of a vector of owning pointers; if
// deleteContainer is set, also deletes the heap-allocated vector itself.
// (Defined first so the delegating overload below sees the declaration.)
template<class T>
void deleteElements(std::vector<T*>& v, bool deleteContainer) {
    for (typename std::vector<T*>::const_iterator it = v.begin(); it != v.end(); ++it) {
        delete *it;
    }
    if (deleteContainer) {
        delete &v;
    }
}

// Deletes the elements but leaves the (now dangling) pointers in the vector.
template<class T>
void deleteElements(std::vector<T*>& v) {
    deleteElements(v, false);
}
-
-static Lock deviceCPULock;
-static std::map<int, std::vector<int> > deviceCPUs;
-
-std::vector<int>& getDeviceCPUs(int deviceID) {
- deviceCPULock.acquire();
- if (deviceCPUs.count(deviceID) == 0 && deviceID >= 0) {
- struct cudaDeviceProp props;
- checkCudaErrors(cudaGetDeviceProperties(&props, deviceID));
- char pciString[13];
-
- sprintf(pciString, "%04x", props.pciDomainID);
- pciString[4] = ':';
- sprintf(pciString + 5, "%02x", props.pciBusID);
- pciString[7] = ':';
- sprintf(pciString + 8, "%02x", props.pciDeviceID);
- pciString[10] = '.';
- pciString[11] = '0';
- pciString[12] = 0;
- std::string path = std::string("/sys/bus/pci/devices/") + std::string(pciString) + "/local_cpulist";
- ifstream f(path.c_str());
-
- if (f.is_open()) {
- std::string cpuString;
- while (getline(f, cpuString, ',')) {
- int start, end;
- int found = sscanf(cpuString.c_str(), "%d-%d", &start, &end);
- end = found == 1 ? start : end;
- if (found > 0) {
- for (int i = start; i <= end; ++i) {
- deviceCPUs[deviceID].push_back(i);
- }
- }
- }
- f.close();
- } else {
- printf("Unable to open %s\n", path.c_str());
- }
- }
- vector<int>& ret = deviceCPUs[deviceID];
- deviceCPULock.release();
- return ret;
-}
-
-template void shuffleVector<int>(std::vector<int>& v, int start, int end);
-template std::string tostr<int>(int n);
-template void deleteElements<NVMatrix>(std::vector<NVMatrix*>& v, bool deleteContainer);
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <map>
-#include <algorithm>
-#include "../include/weights.cuh"
-#include "../include/lr.cuh"
-#include "../include/worker.cuh"
-
-using namespace std;
-
-/* ========================
- * IWeightReducer
- * ========================
- */
// Device that owns the reduction target (the target replica's device).
int IWeightReducer::getDeviceID() {
    return _replicas[_tgtReplicaID]->getDeviceID();
}

IWeightReducer::IWeightReducer(std::map<int,Weights*>& replicas, int tgtReplicaID) : _replicas(replicas), _tgtReplicaID(tgtReplicaID) {
}

IWeightReducer::~IWeightReducer() {
}

// Factory: the 8-replica configuration gets the pipelined parallel reducer;
// everything else falls back to the sequential one. Caller owns the result.
IWeightReducer& IWeightReducer::make(std::map<int,Weights*>& replicas, int tgtReplicaID) {
    if (replicas.size() == 8) {
        return *new ParallelWeightReducer(replicas, tgtReplicaID);
    }
    return *new SequentialWeightReducer(replicas, tgtReplicaID);
}
-
-/* ========================
- * SequentialWeightReducer
- * ========================
- */
SequentialWeightReducer::SequentialWeightReducer(std::map<int,Weights*>& replicas, int tgtReplicaID) : IWeightReducer(replicas, tgtReplicaID) {
    _sb = new StreamBroadcast();
}

SequentialWeightReducer::~SequentialWeightReducer() {
    delete _sb;
}

// Accumulates each non-target replica's gradient shard into the target's
// inc (or grad) matrix, one replica at a time, via StreamBroadcast transfers
// with target scale 1 and output scale gradScale.
void SequentialWeightReducer::reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc) {
    std::map<int, NVMatrix*> mats; // device id -> grad
    mats[getDeviceID()] = toInc ? &_replicas[_tgtReplicaID]->getInc() : &_replicas[_tgtReplicaID]->getGrad();
    // Walk the replicas starting just past the target, wrapping around;
    // the map holds only {target, current source} during each transfer.
    for (int i = 0, r = _tgtReplicaID; i < _replicas.size(); ++i, r = (r + 1) % _replicas.size()) {
        if (r != _tgtReplicaID) {
            mats[_replicas[r]->getDeviceID()] = gradShards[r];
            _sb->transfer(mats, _replicas[r]->getDeviceID(), 1, gradScale);
            mats.erase(_replicas[r]->getDeviceID());
        }
    }
}
-
-/* ========================
- * ParallelWeightReducer
- * ========================
- */
ParallelWeightReducer::ParallelWeightReducer(std::map<int,Weights*>& replicas, int tgtReplicaID) : IWeightReducer(replicas, tgtReplicaID) {
    // The pipelined 8-GPU reducer delivers to this replica's device.
    _reducer = &(new EightGPUReducer1(getDeviceID()))->construct();
}

ParallelWeightReducer::~ParallelWeightReducer() {
    delete _reducer;
}

// Hands all gradient shards to the 8-GPU reduction pipeline at once;
// the target slot is the target replica's inc (or grad) matrix.
void ParallelWeightReducer::reduce(std::map<int, NVMatrix*> gradShards, float gradScale, bool toInc) {
    std::map<int, NVMatrix*> mats; // device id -> grad
    mats[getDeviceID()] = toInc ? &_replicas[_tgtReplicaID]->getInc() : &_replicas[_tgtReplicaID]->getGrad();
    for (std::map<int,Weights*>::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) {
        if (it->first != _tgtReplicaID) {
            mats[it->second->getDeviceID()] = gradShards[it->first];
        }
    }
    _reducer->reduce(mats, gradScale, 1);
}
-
-// weights has pointer to layer, layer pointer to thread
-// thread has sync (copy) object for every other thread
-// weights uses copy object to sum grad contributions into inc matrix slice (phase 1)
-// weights broadcasts inc matrix slice to other inc matrix replicas (phase 2)
-
// Dereferencing a Weights object yields the weight matrix itself.
NVMatrix& Weights::operator*() const {
    return getW();
}

/*
 * TODO: get rid of this constructor duplication.
 */
// Replica constructor: shares the source's host matrices and momentum/grad
// settings (wc and wball forced to 0, no cleanup) and records the source so
// the GPU matrices are shared in copyToGPU().
Weights::Weights(Weights& srcWeights, ParameterSchedule& lrs, Layer& parent) {
    init(srcWeights.getCPUW(), srcWeights.getCPUWInc(), lrs, parent, 0, 0, srcWeights.getMom(), srcWeights.isUseGrad(), false);
    _srcWeights = &srcWeights;
}

// Owning constructor: takes ownership of the host matrices (cleanup = true).
Weights::Weights(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc,
                 float wball, float mom, bool useGrad) {
    init(hWeights, hWeightsInc, lrs, parent, wc, wball, mom, useGrad, true);
}
-
// Common field initialization for both constructors. 'cleanup' records
// whether this object owns (and must delete) the host matrices.
void Weights::init(Matrix& hWeights, Matrix& hWeightsInc, ParameterSchedule& lrs, Layer& parent, float wc,
                   float wball, float mom, bool useGrad, bool cleanup) {
    _srcWeights = NULL;
    _hWeights = &hWeights;
    _hWeightsInc = &hWeightsInc;
    _numUpdates = 0;
    _lrs = &lrs;          // learning rate schedule (owned; freed in dtor)
    _parent = &parent;
    _wc = wc;             // weight decay coefficient (see update())
    _wball = wball;
    _mom = mom;           // momentum
    _useGrad = useGrad;
    _onGPU = false;       // GPU matrices materialize in copyToGPU()
    _weights = NULL;
    _weightsInc = NULL;
    _weightsGrad = NULL;
    _cleanup = cleanup;
    _reducer = NULL;      // created lazily in getReducer()
    _broadcaster = NULL;  // created lazily in getBroadcaster()
}
-
Weights::~Weights() {
    delete _lrs;
    delete _reducer;
    delete _broadcaster;
    if (_cleanup) {
        delete _hWeights;
        delete _hWeightsInc;
        // Replicas share the owner's GPU matrices -- only the owner frees them.
        if (_srcWeights == NULL) {
            delete _weights;
            delete _weightsInc;
            delete _weightsGrad;
        }
    }
}
-
// GPU-side accessors: valid only after copyToGPU().
NVMatrix& Weights::getW() const {
    assert(_onGPU);
    return *_weights;
}

NVMatrix& Weights::getInc() const {
    assert(_onGPU);
    return *_weightsInc;
}

/*
 * TODO: This seems like pretty nasty behavior, I should change this.
 */
// When gradients aren't stored separately (_useGrad == false), the inc
// matrix doubles as the gradient buffer.
NVMatrix& Weights::getGrad() const {
    assert(_onGPU);
    return _useGrad ? *_weightsGrad : *_weightsInc;
}

Matrix& Weights::getCPUW() const {
    return *_hWeights;
}

Matrix& Weights::getCPUWInc() const {
    return *_hWeightsInc;
}

int Weights::getNumRows() const {
    return _hWeights->getNumRows();
}

int Weights::getNumCols() const {
    return _hWeights->getNumCols();
}

// Map from replica ID to that replica's Weights object (see addReplica()).
map<int,Weights*>& Weights::getReplicas() {
    return _replicas;
}
-
/*
 * Returns a column-slice view covering replicaID's shard of 'mat', viewed as
 * a flat 1 x n row. Shards are _shardSize wide; both bounds are clamped to n
 * so the last shard may be short (or empty). Caller must delete the returned
 * view.
 */
template<class T> T& Weights::getShard(T& mat, int replicaID) {
    const int n = mat.getNumElements();
    T& line = mat.reshaped(1, n);
    const int shardStart = min(n, replicaID * _shardSize);
    const int shardEnd = min(n, (replicaID + 1) * _shardSize);
    T& slice = line.sliceCols(shardStart, shardEnd);
    assert(slice.isView());
    delete &line; // the temporary reshaped view is no longer needed
    return slice;
}

// This replica's own shard.
template<class T> T& Weights::getShard(T& mat) {
    return getShard(mat, getReplicaID());
}
-
// Lazily builds the broadcaster over the set of replica devices.
ISafeBroadcastNetwork& Weights::getBroadcaster() {
    if (_broadcaster == NULL) {
        set<int> devices;
        for (map<int, Weights*>::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) {
            devices.insert(it->second->getDeviceID());
        }
        // NOTE: we must use safe broadcaster because we want to *add* our value to everyone else
        _broadcaster = &ISafeBroadcastNetwork::make(devices, getDeviceID()); //&(new NaiveBroadcaster(devices, getDeviceID()))->construct();
    }
    return *_broadcaster;
}

// Lazily builds the gradient reducer over the replicas.
IWeightReducer& Weights::getReducer() {
    if (_reducer == NULL) {
        _reducer = &IWeightReducer::make(_replicas, getReplicaID());
    }
    return *_reducer;
}
-
// Copies the weights (and this replica's inc shard) back to host memory.
// Only weight owners participate; replica 0 additionally re-broadcasts W to
// all replicas, keeping them synchronized as a side effect.
void Weights::copyToCPU() {
    if (_srcWeights == NULL) {
        assert(_onGPU);
        NVMatrix::syncStream(); // for safety
        if (getReplicaID() == 0) {
            _weights->copyToHost(*_hWeights);

            // Synchronize weights amongst replicas while we're at it.
            map<int,NVMatrix*> weights;
            for (map<int,Weights*>::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) {
                weights[it->second->getDeviceID()] = &it->second->getW();
            }
            // These things sync before returning.
            getBroadcaster().broadcast(weights, 1, 0);
        }
        if (_useGrad) {
            // Only this replica's shard of the inc matrix lives on the GPU.
            Matrix& hIncShard = getShard(*_hWeightsInc);
            _weightsInc->copyToHost(hIncShard);
            delete &hIncShard;
        } else { // In this case there's definitely only one replica
            _weightsInc->copyToHost(*_hWeightsInc);
        }
    }
}
-
// This function is assumed to be called in the order in which the layers
// were defined
// Materializes the GPU matrices from host memory. Owners allocate (on first
// call) and upload; replicas just alias the owner's device matrices.
void Weights::copyToGPU() {
    assert(!_onGPU);
    // Copies are performed on the default (computation) stream, so that's fine.
    if (_srcWeights == NULL) {
        // Allocate on first use; reuse existing device matrices afterwards.
        _weights = _weights == NULL ? new NVMatrix() : _weights;
        _weightsInc = _weightsInc == NULL ? new NVMatrix() : _weightsInc;
        _weights->copyFromHost(*_hWeights, true);

        if (_useGrad) {
            // In this case there is no need to store the entire inc matrix.
            // Just this replica's shard (for synchronization purposes) will do.
            Matrix& hIncShard = getShard(*_hWeightsInc);
            _weightsInc->copyFromHost(hIncShard, true);
            delete &hIncShard;
        } else {
            _weightsInc->copyFromHost(*_hWeightsInc, true);
        }

        _weightsGrad = _useGrad ? (_weightsGrad == NULL ? new NVMatrix(*_weights) : _weightsGrad) : NULL;
    } else {
        // Replica: share the owner's device matrices.
        _weights = _srcWeights->_weights;
        _weightsInc = _srcWeights->_weightsInc;
        _weightsGrad = _srcWeights->_weightsGrad;
    }
    _onGPU = true;
}
-
// Combines every replica's gradient into this replica's inc shard, then
// applies the resulting increment to all replicas' weight shards:
//   inc = mom * inc + lr * grad (- wc * lr * w), then w += inc per shard.
void Weights::aggregateReplicaGradients(float progress) {
    map<int, NVMatrix*> gradShards;
    map<int, NVMatrix*> wShards;
    // Each replica contributes the slice of its grad/W that this replica owns.
    for (map<int,Weights*>::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) {
        gradShards[it->first] = &getShard(it->second->getGrad(), getReplicaID());
        wShards[it->first] = &getShard(it->second->getW(), getReplicaID());
        assert(wShards[it->first]->isContiguous() && gradShards[it->first]->isContiguous());
    }

    float gradScale = _lrs->getValue(progress);
    NVMatrix::setDeviceID(getDeviceID());

    // Fold in this replica's own gradient, with weight decay if enabled.
    if (_wc > 0) {
        NVMatrixTernaryOps::WeightedAdd wadd = NVMatrixTernaryOps::WeightedAdd(_mom, gradScale, -_wc * _lrs->getValue(progress));
        _weightsInc->applyTernary(wadd, *gradShards[getReplicaID()], *wShards[getReplicaID()], *_weightsInc);
    } else {
        _weightsInc->add(*gradShards[getReplicaID()], _mom, gradScale);
    }

    // Reduce everyone's gradient into my inc shard
    NVMatrix::syncStream(); // Crucial since the reducer does everything in its own streams!!
    getReducer().reduce(gradShards, gradScale, true);

    // Broadcast my inc -> all replicas
    map<int, NVMatrix*> mats; // device id -> grad
    mats[getDeviceID()] = _weightsInc;
    for (map<int, Weights*>::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) {
        if (it->first != getReplicaID()) {
            mats[it->second->getDeviceID()] = wShards[it->first];
        }
    }
    // The safe broadcaster *adds* the inc into each remote weight shard
    // (see getBroadcaster()).
    getBroadcaster().broadcast(mats, 1, 1);

    NVMatrix::setDeviceID(getDeviceID());
    // Apply the update to the local weight shard too.
    wShards[getReplicaID()]->add(*_weightsInc);

    // Cleanup
    for (map<int,Weights*>::const_iterator it = _replicas.begin(); it != _replicas.end(); ++it) {
        delete gradShards[it->first];
        delete wShards[it->first];
    }
}
-
-
// When _useGrad is false, weightsInc is assumed to contain the
// entire, properly scaled weight increment.
// OTHERWISE, scale your gradient by 1 / numCases only.
// The scaling by epsW will be done in this routine.
void Weights::update(float progress) {
    // Only true owner of weights updates
//    printf("%s update weights\n", _parent->getName().c_str());
    // A zero base learning rate disables updates entirely.
    if (_srcWeights == NULL && _lrs->getBaseValue() > 0) {
        assert(_onGPU);
        if (_useGrad) {
            aggregateReplicaGradients(progress);
        } else { // Definitely no replicas in this case
            if (_wc > 0) {
                // Weight decay: inc -= wc * lr * w.
                _weightsInc->add(*_weights, -_wc * _lrs->getValue(progress));
            }
            _weights->add(*_weightsInc);
        }
        _numUpdates = 0;
    }
}
-
// Shared (replica) weights count updates on the owner.
int Weights::incNumUpdates() {
    if (_srcWeights != NULL) {
        return _srcWeights->incNumUpdates();
    }
    return _numUpdates++;
}

// Returns the number of times a gradient has been computed for this
// weight matrix during the current pass (interval between two calls of update())
// through the net. This number will only be greater than 1 if this weight matrix
// is *shared* by multiple layers in the net.
int Weights::getNumUpdates() const {
    if (_srcWeights != NULL) {
        return _srcWeights->getNumUpdates();
    }
    return _numUpdates;
}

// Learning rate at the given training progress.
float Weights::getEps(float progress) const {
    return _lrs->getValue(progress);
}

float Weights::getMom() const {
    return _mom;
}

float Weights::getWC() const {
    return _wc;
}

float Weights::getWBall() const {
    return _wball;
}

bool Weights::isUseGrad() const { // is good grammar
    return _useGrad;
}

// True iff this object owns the GPU matrices (it is not a replica view).
bool Weights::isOwner() const {
    return _srcWeights == NULL;
}

ParameterSchedule& Weights::getLearningRateSchedule() const {
    return *_lrs;
}

// Registers a replica and recomputes the per-replica shard size
// (ceil division, so the last shard may be short).
void Weights::addReplica(Weights& replica) {
    _replicas[replica.getReplicaID()] = &replica;

    const int n = _hWeights->getNumElements();
    _shardSize = DIVUP(n, _replicas.size());
}

int Weights::getReplicaID() {
    return _parent->getReplicaID();
}

int Weights::getDeviceID() {
    return _parent->getDeviceID();
}

Layer& Weights::getParent() {
    return *_parent;
}
-
-/*
- * ===============
- * WeightList
- * ===============
- */
Weights& WeightList::operator[](const int i) const {
    return *_weightList[i];
}

Weights& WeightList::at(const int i) const {
    return *_weightList[i];
}

// The list owns its Weights objects.
WeightList::~WeightList() {
    for (int i = 0; i < _weightList.size(); i++) {
        delete _weightList[i];
    }
}

WeightList::WeightList() {
}

// Takes ownership of w.
void WeightList::addWeights(Weights& w) {
    _weightList.push_back(&w);
}


void WeightList::update(float progress) {
    for (int i = 0; i < getSize(); i++) {
        _weightList[i]->update(progress);
    }
}

void WeightList::copyToCPU() {
    for (int i = 0; i < getSize(); i++) {
        _weightList[i]->copyToCPU();
    }
}

void WeightList::copyToGPU() {
    for (int i = 0; i < getSize(); i++) {
        _weightList[i]->copyToGPU();
    }
}

int WeightList::getSize() const {
    return _weightList.size();
}

// Pairs each weight matrix with its positional counterpart in the replica list.
void WeightList::addReplica(WeightList& replica) {
    for (int i = 0; i < getSize(); i++) {
        _weightList[i]->addReplica(replica[i]);
    }
}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <algorithm>
-#include "../include/util.cuh"
-#include "../include/worker.cuh"
-#include "../include/timer.cuh"
-
-using namespace std;
-
-/*
- * ====================
- * WorkResult
- * ====================
- */
// Takes ownership of 'results' (freed in the destructor).
WorkResult::WorkResult(WorkResult::RESULTS resultType, Cost& results) : _resultType(resultType), _results(&results) {
}

// Result with no attached cost (e.g. SYNC_DONE).
WorkResult::WorkResult(WorkResult::RESULTS resultType) : _resultType(resultType), _results(NULL) {
}

WorkResult::~WorkResult() {
    delete _results; // delete NULL is ok
}

Cost& WorkResult::getResults() const {
    return *_results;
}

WorkResult::RESULTS WorkResult::getResultType() const {
    return _resultType;
}
-
-/*
- * ====================
- * Worker
- * ====================
- */
// Base class for work items executed against a ConvNet; subclasses implement
// run(), whose return value signals whether the worker loop should stop.
Worker::Worker(ConvNet& convNet) : _convNet(&convNet) {
}

Worker::~Worker() {
}
-
-/*
- * ====================
- * DataWorker
- * ====================
- */
DataWorker::DataWorker(ConvNet& convNet, CPUData& data) : Worker(convNet), _data(&data), _dp(NULL) {
    assert(_data != NULL);
}

// Points the data provider at this worker's batch, runs the subclass's
// _run(), then releases the data. Returns false: keep the worker loop alive.
bool DataWorker::run() {
    _dp = &_convNet->getDataProvider();
    _dp->setData(*_data);
    _run();
    _dp->clearData();
    return false;
}

DataWorker::~DataWorker() {
}
-
-/*
- * ====================
- * TrainingWorker
- * ====================
- */
TrainingWorker::TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test)
    : DataWorker(convNet, data), _progress(progress), _test(test) {
}

// Runs fprop (and, when training, bprop + weight updates) for every pass of
// every minibatch, accumulating the cost over the whole batch and posting it
// as a BATCH_DONE result.
void TrainingWorker::_run() {
    _convNet->setTrainingProgress(_progress);
    Cost& batchCost = *new Cost();
    int numMinibatches = _dp->getNumMinibatches();
    for (int i = 0; i < numMinibatches; i++) {
        for (int p = 0; p < _convNet->getNumPasses(); p++) {
            _convNet->fprop(i, p, _test ? PASS_TEST : PASS_TRAIN);
            _convNet->getCost(batchCost);

            if (!_test) {
                _convNet->bprop(p, PASS_TRAIN);
                _convNet->updateWeights(p);
            }
        }
    }
    // The WorkResult takes ownership of batchCost.
    _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}
-
-/*
- * ====================
- * SyncWorker
- * ====================
- */
-SyncWorker::SyncWorker(ConvNet& convNet) : Worker(convNet) {
-}
-
-bool SyncWorker::run() {
- _convNet->copyToCPU();
- _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::SYNC_DONE));
- return false;
-}
-
-/*
- * ====================
- * ExitWorker
- * ====================
- */
-ExitWorker::ExitWorker(ConvNet& convNet) : Worker(convNet) {
-}
-
-bool ExitWorker::run() {
- return true;
-}
-
-/*
- * ====================
- * GradCheckWorker
- * ====================
- */
-GradCheckWorker::GradCheckWorker(ConvNet& convNet, CPUData& data)
- : DataWorker(convNet, data) {
-}
-
-void GradCheckWorker::_run() {
- _convNet->checkGradients();
- exit(0); // eh
-}
-
-/*
- * ====================
- * MultiviewTestWorker
- * ====================
- */
-MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* logregName)
- : DataWorker(convNet, data), _numViews(numViews), _cpuProbs(&cpuProbs), _logregName(logregName) {
-// assert(_data->getNumCases() % _numViews == 0);
-// assert(convNet.getNumReplicas() == 1); // For now?
-}
-
-MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews)
- : DataWorker(convNet, data), _numViews(numViews), _cpuProbs(NULL), _logregName("") {
-// assert(_data->getNumCases() % _numViews == 0);
-}
-
-MultiviewTestWorker::~MultiviewTestWorker() {
-// delete _cpuProbs;
-}
-
-CPUData& MultiviewTestWorker::getMinibatch(int v, int i) {
- int numCasesPerView = _dp->getNumCases() / _numViews;
- int miniStart = v * numCasesPerView + i * _dp->getMinibatchSize();
- int miniEnd = v * numCasesPerView + min(numCasesPerView, (i + 1) * _dp->getMinibatchSize());
- CPUData& mini = _dp->getDataSlice(miniStart, miniEnd);
- return mini;
-}
-
-void MultiviewTestWorker::_run() {
- int numCasesPerView = _dp->getNumCases() / _numViews;
- int numMiniPerView = DIVUP(numCasesPerView, _dp->getMinibatchSize());
-
- Cost& batchCost = *new Cost();
- for (int i = 0; i < numMiniPerView; i++) {
- for (int v = 0; v < _numViews - 1; v++) {
- for (int p = 0; p < _convNet->getNumPasses(); p++) {
- _convNet->fprop(getMinibatch(v, i), p, v == 0 ? PASS_MULTIVIEW_TEST_START : PASS_MULTIVIEW_TEST);
- }
- }
- for (int p = 0; p < _convNet->getNumPasses(); p++) {
- _convNet->fprop(getMinibatch(_numViews - 1, i), p, PASS_MULTIVIEW_TEST_END);
- _convNet->getCost(batchCost);
- }
-// if (_cpuProbs != NULL) {
-// LogregCostLayer& logregLayer = *dynamic_cast<LogregCostLayer*>(&_convNet->getLayer(_logregName, 0));
-// NVMatrix::setDeviceID(logregLayer.getDeviceID());
-// Matrix& miniProbs = _cpuProbs->sliceRows(i * _dp->getMinibatchSize(),
-// min(numCasesReal, (i + 1) * _dp->getMinibatchSize()));
-// NVMatrix& acts = logregLayer.getProbsAccum();
-// NVMatrix acts_T;
-// acts.transpose(acts_T);
-// acts_T.copyToHost(miniProbs);
-//
-// delete &miniProbs;
-// }
- }
- _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
-}
-
-/*
- * ====================
- * FeatureWorker
- * ====================
- */
-FeatureWorker::FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames, bool deleteFeatures)
- : DataWorker(convNet, data), _ftrs(&ftrs), _layerNames(&layerNames), _deleteFeatures(deleteFeatures) {
- assert(layerNames.size() == ftrs.size());
- for (int i = 0; i < layerNames.size(); i++) {
- assert(ftrs[i]->getNumRows() == data.getNumCases());
- assert(!ftrs[i]->isTrans());
- }
-}
-
-FeatureWorker::~FeatureWorker() {
- if (_deleteFeatures) {
- for (int i = 0; i < _ftrs->size(); i++) {
- delete _ftrs->at(i);
- }
- delete _ftrs;
- }
- delete _layerNames;
-}
-
-void FeatureWorker::_run() {
- Cost& batchCost = *new Cost();
- map<int,int> repStart; // Feature write start offsets within minibatch
- for (int i = 0; i < _dp->getNumMinibatches(); i++) {
- for (int f = 0; f < _layerNames->size(); f++) {
- repStart[f] = 0;
- }
-
- for (int p = 0; p < _convNet->getNumPasses(); p++) {
- _convNet->fprop(i, p, PASS_FEATURE_GEN);
- _convNet->getCost(batchCost);
- for (int f = 0; f < _layerNames->size(); f++) {
-
- if (_convNet->getLayer(_layerNames->at(f), 0).getFwdActiveInputReplicaIdx(p) >= 0) {
- Matrix& miniFtrs = _ftrs->at(f)->sliceRows(i * _dp->getMinibatchSize(),
- min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize()));
-
- for (int r = 0; r < _convNet->getLayer(_layerNames->at(f), 0).getNumReplicas(); ++r) {
- Layer& ftrLayer = _convNet->getLayer(_layerNames->at(f), r);
- int d = ftrLayer.getDeviceID();
- NVMatrix::setDeviceID(d);
- NVMatrix& acts = ftrLayer.getActs();
-
- Matrix& repMiniFtrs = miniFtrs.sliceRows(repStart[f],
- min(int(miniFtrs.getNumRows()), repStart[f] + acts.getLeadingDim()));
-
- NVMatrix acts_T;
- acts.transpose(false);
- acts.transpose(acts_T);
- acts_T.copyToHost(repMiniFtrs);
- NVMatrix::syncStream(); // eh why not
-
- delete &repMiniFtrs;
-
- repStart[f] += acts.getLeadingDim();
- }
- delete &miniFtrs;
- }
- }
- }
- }
- _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
-}
-
-/*
- * ====================
- * DataGradWorker
- * ====================
- */
-DataGradWorker::DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx)
- : DataWorker(convNet, data), _dataGrads(&dataGrads), _dataLayerIdx(dataLayerIdx), _softmaxLayerIdx(softmaxLayerIdx) {
-// assert(dataGrads.getNumRows() == data.getNumCases());
-// assert(!dataGrads.isTrans());
-}
-
-DataGradWorker::~DataGradWorker() {
-// delete _dataGrads;
-}
-
-void DataGradWorker::_run() {
-// DataLayer& dataLayer = *dynamic_cast<DataLayer*>(&_convNet->getLayer(_dataLayerIdx));
-// SoftmaxLayer& softmaxLayer = *dynamic_cast<SoftmaxLayer*>(&_convNet->getLayer(_softmaxLayerIdx));
-// softmaxLayer.setDoLogregGrad(false);
-// Cost& batchCost = *new Cost(0);
-// for (int i = 0; i < _dp->getNumMinibatches(); i++) {
-// _convNet->fprop(i, PASS_TEST);
-// _convNet->getCost(batchCost);
-// softmaxLayer.getActs().apply(NVMatrixOps::Log(), softmaxLayer.getActsGrad());
-//
-// softmaxLayer.getActsGrad().addScalar(1);
-// softmaxLayer.getActsGrad().scale(-1);
-// softmaxLayer.incRcvdBInputs();
-// softmaxLayer.bprop(PASS_TEST);
-//
-// Matrix& miniDataGrads = _dataGrads->sliceRows(i * _dp->getMinibatchSize(),
-// min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize()));
-// NVMatrix& grads = dataLayer.getActsGrad();
-// NVMatrix grads_T;
-// if (grads.isTrans()) {
-// NVMatrix& soft_T = grads.getTranspose();
-// soft_T.transpose(grads_T);
-// delete &soft_T;
-// } else {
-// grads.transpose(grads_T);
-// }
-// grads_T.copyToHost(miniDataGrads);
-// delete &miniDataGrads;
-//
-// _convNet->reset();
-// }
-// cudaThreadSynchronize();
-// _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
-}
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from python_util.gpumodel import *
-import numpy as n
-import numpy.random as nr
-
-def get_src(filename):
- src = IGPUModel.load_checkpoint(filename)
- return src['model_state']['layers']
-
-# Initialize weight matrix by copying weight matrix of given layer
-def makew(name, idx, shape, params):
- src = get_src(params[0])
- return src[name]['weights'][idx]
-
-# Initialize bias vector by copying bias vector of given layer
-def makeb(name, shape, params):
- src = get_src(params[0])
- return src[name]['biases']
-
-def concat(shape, src, src_layers, src_func):
- mat = n.empty(shape, dtype=n.single, order='F')
- start = 0
- for s in src_layers:
- m = src_func(src[s])
- mat[:,start:start+m.shape[1]] = m
- start += m.shape[1]
- return mat
-
-# Initialize weight matrix by concatenating weight matrices of given layers
-def makewcat(name, idx, shape, params):
- src, src_layers = get_src(params[0]), params[1:]
- return concat(shape, src, src_layers, lambda x: x['weights'][idx])
-
-# Initialize bias vector by concatenating bias vectors of given layers
-def makebcat(name, shape, params):
- src, src_layers = get_src(params[0]), params[1:]
- return concat(shape, src, src_layers, lambda x: x['biases'])
-
-# Initialize bias vector from tuple input
-def makeb_vec(name, shape, params):
- return n.array([n.single(x) for x in params], dtype=n.single).reshape((1, len(params)))
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from math import exp
-import sys
-import ConfigParser as cfg
-import os
-import numpy as n
-import numpy.random as nr
-from math import ceil, floor
-from collections import OrderedDict
-from os import linesep as NL
-from python_util.options import OptionsParser
-import re
-
-class LayerParsingError(Exception):
- pass
-
-# A neuron that doesn't take parameters
-class NeuronParser:
- def __init__(self, type, func_str, uses_acts=True, uses_inputs=True):
- self.type = type
- self.func_str = func_str
- self.uses_acts = uses_acts
- self.uses_inputs = uses_inputs
-
- def parse(self, type):
- if type == self.type:
- return {'type': self.type,
- 'params': {},
- 'usesActs': self.uses_acts,
- 'usesInputs': self.uses_inputs}
- return None
-
-# A neuron that takes parameters
-class ParamNeuronParser(NeuronParser):
- neuron_regex = re.compile(r'^\s*(\w+)\s*\[\s*(\w+(\s*,\w+)*)\s*\]\s*$')
- def __init__(self, type, func_str, uses_acts=True, uses_inputs=True):
- NeuronParser.__init__(self, type, func_str, uses_acts, uses_inputs)
- m = self.neuron_regex.match(type)
- self.base_type = m.group(1)
- self.param_names = m.group(2).split(',')
- assert len(set(self.param_names)) == len(self.param_names)
-
- def parse(self, type):
- m = re.match(r'^%s\s*\[([\d,\.\s\-]*)\]\s*$' % self.base_type, type)
- if m:
- try:
- param_vals = [float(v.strip()) for v in m.group(1).split(',')]
- if len(param_vals) == len(self.param_names):
- return {'type': self.base_type,
- 'params': dict(zip(self.param_names, param_vals)),
- 'usesActs': self.uses_acts,
- 'usesInputs': self.uses_inputs}
- except TypeError:
- pass
- return None
-
-class AbsTanhNeuronParser(ParamNeuronParser):
- def __init__(self):
- ParamNeuronParser.__init__(self, 'abstanh[a,b]', 'f(x) = a * |tanh(b * x)|')
-
- def parse(self, type):
- dic = ParamNeuronParser.parse(self, type)
- # Make b positive, since abs(tanh(bx)) = abs(tanh(-bx)) and the C++ code
- # assumes b is positive.
- if dic:
- dic['params']['b'] = abs(dic['params']['b'])
- return dic
-
-class ParamParser:
- lrs_regex = re.compile(r'^\s*(\w+)\s*(?:\[\s*(\w+(\s*;\w+)*)\s*\])?\s*$')
- param_converters = {'i': int,
- 'f': float}
- def __init__(self, type):
- m = self.lrs_regex.match(type)
- self.base_type = m.group(1)
- param_names_with_type = m.group(2).split(';') if m.group(2) is not None else []
- self.param_names = [p[1:] for p in param_names_with_type]
- self.param_types = [self.param_converters[p[0]] for p in param_names_with_type]
- self.param_regex_inner = ";".join([('\s*%s\s*=\s*[^;,\s=]+\s*' % p) for p in self.param_names])
- self.regex_str = ('^%s\s*(?:\[(%s)\])?\s*$') % (self.base_type, self.param_regex_inner)
- assert len(set(self.param_names)) == len(self.param_names)
-
- def parse(self, type):
- m = re.match(self.regex_str, type, flags=re.IGNORECASE)
- if m:
- try:
- param_vals = [ptype(v.split('=')[1].strip()) for ptype,v in zip(self.param_types, m.group(1).split(';'))] if m.group(1) is not None else []
- if len(param_vals) == len(self.param_names):
- return {'type': self.base_type,
- 'params': dict(zip(self.param_names, param_vals))}
- except TypeError:
- pass
- return None
-
-# Subclass that throws more convnet-specific exceptions than the default
-class MyConfigParser(cfg.SafeConfigParser):
- def safe_get(self, section, option, f=cfg.SafeConfigParser.get, typestr=None, default=None):
- try:
- return f(self, section, option)
- except cfg.NoOptionError, e:
- if default is not None:
- return default
- raise LayerParsingError("Layer '%s': required parameter '%s' missing" % (section, option))
- except ValueError, e:
- if typestr is None:
- raise e
- raise LayerParsingError("Layer '%s': parameter '%s' must be %s" % (section, option, typestr))
-
- def safe_get_list(self, section, option, f=str, typestr='strings', default=None):
- v = self.safe_get(section, option, default=default)
- if type(v) == list:
- return v
- try:
- return [f(x.strip()) for x in v.split(',')]
- except:
- raise LayerParsingError("Layer '%s': parameter '%s' must be ','-delimited list of %s" % (section, option, typestr))
-
- def safe_get_int(self, section, option, default=None):
- return self.safe_get(section, option, f=cfg.SafeConfigParser.getint, typestr='int', default=default)
-
- def safe_get_float(self, section, option, default=None):
- return self.safe_get(section, option, f=cfg.SafeConfigParser.getfloat, typestr='float', default=default)
-
- def safe_get_bool(self, section, option, default=None):
- return self.safe_get(section, option, f=cfg.SafeConfigParser.getboolean, typestr='bool', default=default)
-
- def safe_get_float_list(self, section, option, default=None):
- return self.safe_get_list(section, option, float, typestr='floats', default=default)
-
- def safe_get_int_list(self, section, option, default=None):
- return self.safe_get_list(section, option, int, typestr='ints', default=default)
-
- def safe_get_bool_list(self, section, option, default=None):
- return self.safe_get_list(section, option, lambda x: x.lower() in ('true', '1'), typestr='bools', default=default)
-
-# A class that implements part of the interface of MyConfigParser
-class FakeConfigParser(object):
- def __init__(self, dic):
- self.dic = dic
-
- def safe_get(self, section, option, default=None):
- if option in self.dic:
- return self.dic[option]
- return default
-
- def safe_get_int(self, section, option, default=None):
- return int(self.safe_get(section, option, default))
-
- def safe_get_int_list(self, section, option, default=None):
- return list(self.safe_get(section, option, default))
-
-class LayerParser:
- def __init__(self):
- self.dic = {}
- self.set_defaults()
-
- # Post-processing step -- this is called after all layers have been initialized
- def optimize(self, layers):
- self.dic['actsTarget'] = -1
- self.dic['actsGradTarget'] = -1
- if len(set(len(l['gpu']) for l in layers.values() if 'inputs' in l and self.dic['name'] in l['inputs'])) > 1:
-# print set(len(l['gpu']) for l in layers.values())
- raise LayerParsingError("Layer '%s': all next layers must have equal number of replicas." % (self.dic['name']))
-
- def parse_params(self, vals, parsers, param_name, human_name, num_params=1):
- dic, name = self.dic, self.dic['name']
-
-# print vals
- if len(vals) != num_params and len(vals) != 1:
- raise LayerParsingError("Layer '%s': expected list of length %d for %s but got list of length %d."% (name, num_params, param_name, len(vals)))
- parsed = []
-# print vals
- for v in vals:
- for p in parsers:
- parsedv = p.parse(v)
- if parsedv:
- parsed += [parsedv]
- break
- if len(parsed) == 1 and num_params > 1:
- parsed = parsed * num_params
- if len(parsed) == num_params:
- return parsed
-# print parsed, vals
- raise LayerParsingError("Layer '%s': unable to parse %s %s=%s." % (name, human_name, param_name, ",".join(vals)))
-
- # Add parameters from layer parameter file
- def add_params(self, mcp):
- pass
-# self.dic['conserveMem'] = mcp.convnet.op.get_value('conserve_mem') if mcp.convnet is not None else 0
-
- def init(self, dic):
- self.dic = dic
- return self
-
- def set_defaults(self):
- self.dic['outputs'] = 0
- self.dic['parser'] = self
- self.dic['requiresParams'] = False
- # Does this layer use its own activity matrix
- # for some purpose other than computing its output?
- # Usually, this will only be true for layers that require their
- # own activity matrix for gradient computations. For example, layers
- # with logistic units must compute the gradient y * (1 - y), where y is
- # the activity matrix.
- #
- # Layers that do not not use their own activity matrix should advertise
- # this, since this will enable memory-saving matrix re-use optimizations.
- #
- # The default value of this property is True, for safety purposes.
- # If a layer advertises that it does not use its own activity matrix when
- # in fact it does, bad things will happen.
- self.dic['usesActs'] = True
-
- # Does this layer use the activity matrices of its input layers
- # for some purpose other than computing its output?
- #
- # Again true by default for safety
- self.dic['usesInputs'] = True
-
- # Force this layer to use its own activity gradient matrix,
- # instead of borrowing one from one of its inputs.
- #
- # This should be true for layers where the mapping from output
- # gradient to input gradient is non-elementwise.
- self.dic['forceOwnActs'] = True
-
- # Does this layer need the gradient at all?
- # Should only be true for layers with parameters (weights).
- self.dic['gradConsumer'] = False
-
- # The gpu indices on which this layer runs
- self.dic['gpu'] = [-1]
-
- def parse(self, name, mcp, prev_layers, model=None):
- self.prev_layers = prev_layers
- self.dic['name'] = name
- self.dic['type'] = mcp.safe_get(name, 'type')
- self.dic['id'] = len(prev_layers)
-
- return self.dic
-
- def verify_float_range(self, v, param_name, _min, _max):
- self.verify_num_range(v, param_name, _min, _max, strconv=lambda x: '%.3f' % x)
-
- def verify_num_range(self, v, param_name, _min, _max, strconv=lambda x:'%d' % x):
- if type(v) == list:
- for i,vv in enumerate(v):
- self._verify_num_range(vv, param_name, _min, _max, i, strconv=strconv)
- else:
- self._verify_num_range(v, param_name, _min, _max, strconv=strconv)
-
- def _verify_num_range(self, v, param_name, _min, _max, input=-1, strconv=lambda x:'%d' % x):
- layer_name = self.dic['name'] if input < 0 else '%s[%d]' % (self.dic['name'], input)
- if _min is not None and _max is not None and (v < _min or v > _max):
- raise LayerParsingError("Layer '%s': parameter '%s' must be in the range %s-%s" % (layer_name, param_name, strconv(_min), strconv(_max)))
- elif _min is not None and v < _min:
- raise LayerParsingError("Layer '%s': parameter '%s' must be greater than or equal to %s" % (layer_name, param_name, strconv(_min)))
- elif _max is not None and v > _max:
- raise LayerParsingError("Layer '%s': parameter '%s' must be smaller than or equal to %s" % (layer_name, param_name, strconv(_max)))
-
- def verify_divisible(self, value, div, value_name, div_name=None, input_idx=0):
- layer_name = self.dic['name'] if len(self.dic['inputs']) == 0 else '%s[%d]' % (self.dic['name'], input_idx)
- if value % div != 0:
- raise LayerParsingError("Layer '%s': parameter '%s' must be divisible by %s" % (layer_name, value_name, str(div) if div_name is None else "'%s'" % div_name))
-
- def verify_str_in(self, value, param_name, lst, input_idx=-1):
- lname = self.dic['name'] if input_idx == -1 else ('%s[%d]' % (self.dic['name'], input_idx))
- if value not in lst:
- raise LayerParsingError("Layer '%s': parameter '%s' must be one of %s" % (lname, param_name, ", ".join("'%s'" % s for s in lst)))
-
- def verify_int_in(self, value, param_name, lst):
- if value not in lst:
- raise LayerParsingError("Layer '%s': parameter '%s' must be one of %s" % (self.dic['name'], param_name, ", ".join("'%d'" % s for s in lst)))
-
- def verify_all_ints_in(self, values, param_name, lst):
- if len([v for v in values if v not in lst]) > 0:
- raise LayerParsingError("Layer '%s': all parameters to '%s' must be among %s" % (self.dic['name'], param_name, ", ".join("'%d'" % s for s in lst)))
-
- def verify_input_dims(self, dims):
- for i,d in enumerate(dims):
- if d is not None and self.dic['numInputs'][i] != d: # first input must be labels
- raise LayerParsingError("Layer '%s': dimensionality of input %d must be %d" % (self.dic['name'], i, d))
-
- # This looks for neuron=x arguments in various layers, and creates
- # separate layer definitions for them.
- @staticmethod
- def detach_neuron_layers(layers):
- for name,l in layers.items():
- if l['type'] != 'neuron' and 'neuron' in l and l['neuron']:
- NeuronLayerParser().detach_neuron_layer(name, layers)
-
- @staticmethod
- def parse_layers(layer_cfg_path, param_cfg_path, model, layers={}):
- try:
- if not os.path.exists(layer_cfg_path):
- raise LayerParsingError("Layer definition file '%s' does not exist" % layer_cfg_path)
- if not os.path.exists(param_cfg_path):
- raise LayerParsingError("Layer parameter file '%s' does not exist" % param_cfg_path)
- if len(layers) == 0:
- mcp = MyConfigParser(dict_type=OrderedDict)
- mcp.readfp(open(layer_cfg_path))
- for name in mcp.sections():
- if not mcp.has_option(name, 'type'):
- raise LayerParsingError("Layer '%s': no type given" % name)
- ltype = mcp.safe_get(name, 'type')
- if ltype not in layer_parsers:
- raise LayerParsingError("Layer '%s': Unknown layer type: '%s'" % (name, ltype))
- layers[name] = layer_parsers[ltype]().parse(name, mcp, layers, model)
-
- LayerParser.detach_neuron_layers(layers)
- for l in layers.values():
- l['parser'].optimize(layers)
- del l['parser']
-
- for name,l in layers.items():
- if not l['type'].startswith('cost.'):
- found = max(name in l2['inputs'] for l2 in layers.values() if 'inputs' in l2)
- if not found:
- raise LayerParsingError("Layer '%s' of type '%s' is unused" % (name, l['type']))
-
- mcp = MyConfigParser(dict_type=OrderedDict)
- mcp.readfp(open(param_cfg_path))
-# mcp.convnet = model
- for name,l in layers.items():
- if not mcp.has_section(name) and l['requiresParams']:
- raise LayerParsingError("Layer '%s' of type '%s' requires extra parameters, but none given in file '%s'." % (name, l['type'], param_cfg_path))
- lp = layer_parsers[l['type']]().init(l)
- lp.add_params(mcp)
- except LayerParsingError, e:
- print e
- sys.exit(1)
- return layers
-
- @staticmethod
- def register_layer_parser(ltype, cls):
- if ltype in layer_parsers:
- raise LayerParsingError("Layer type '%s' already registered" % ltype)
- layer_parsers[ltype] = cls
-
-# Any layer that takes an input (i.e. non-data layer)
-class LayerWithInputParser(LayerParser):
- def __init__(self, num_inputs=-1):
- LayerParser.__init__(self)
- self.num_inputs = num_inputs
-
- def verify_num_params(self, params, auto_expand=True):
- for param in params:
- if len(self.dic[param]) != len(self.dic['inputs']):
- if auto_expand and len(self.dic[param]) == 1:
- self.dic[param] *= len(self.dic['inputs'])
- else:
- raise LayerParsingError("Layer '%s': %s list length does not match number of inputs" % (self.dic['name'], param))
-
- # layers: dictionary: name -> layer
- def optimize(self, layers):
- LayerParser.optimize(self, layers)
- dic = self.dic
-
- # Check if I have an input that no one else uses.
- #print "Layer %s optimizing" % dic['name']
- if not dic['forceOwnActs']:
- for i, inp in enumerate(dic['inputLayers']):
- if inp['outputs'] == dic['outputs'] and sum(('inputs' in ll) and (inp['name'] in ll['inputs']) for ll in layers.itervalues()) == 1:
- # I can share my activity matrix with this layer
- # if it does not use its activity matrix, and I
- # do not need to remember my inputs.
- # TODO: a dropout layer should always be able to overwrite
- # its input. Make it so.
-# print "Layer %s(uses inputs=%d), input %s(uses acts = %d)" % (dic['name'], dic['usesInputs'], inp['name'], inp['usesActs'])
- if not inp['usesActs'] and not dic['usesInputs']:
- dic['actsTarget'] = i
- print "Layer %s using acts from layer %s" % (dic['name'], inp['name'])
-# print "Layer '%s' sharing activity matrix with layer '%s'" % (dic['name'], l['name'])
- # I can share my gradient matrix with this layer if we're on the same GPU.
- # This is different from the logic for actsTarget because this guy doesn't
- # have an actsGrad matrix on my GPU if our GPUs are different, so there's
- # nothing to share.
- if dic['gpu'] == inp['gpu']:
- dic['actsGradTarget'] = i
-# print "Layer '%s' sharing activity gradient matrix with layer '%s'" % (dic['name'], l['name'])
-
- def parse(self, name, mcp, prev_layers, model=None):
- dic = LayerParser.parse(self, name, mcp, prev_layers, model)
-
- dic['inputs'] = [inp.strip() for inp in mcp.safe_get(name, 'inputs').split(',')]
-
- for inp in dic['inputs']:
- if inp not in prev_layers:
- raise LayerParsingError("Layer '%s': input layer '%s' not defined" % (name, inp))
-
- dic['inputLayers'] = [prev_layers[inp] for inp in dic['inputs']]
- dic['gpu'] = mcp.safe_get_int_list(name, 'gpu', default=dic['inputLayers'][0]['gpu'])
- dic['gpus'] = ", ".join('%s' % d for d in dic['gpu'])
- dic['numReplicas'] = len(dic['gpu'])
-
- if len(set(dic['gpu'])) != len(dic['gpu']):
- raise LayerParsingError("Layer '%s': all replicas must run on different GPUs." % (name))
-
- for inp in dic['inputs']:
- # Data layers do not explicitly define how many replicas they have.
- # The number of replicas for a data layer is given by the number of replicas
- # in the next layer(s). So we set that here.
- inpl = prev_layers[inp]
- if inpl['type'] == 'data':
- inpl['numReplicas'] = dic['numReplicas']
- if inpl['numReplicas'] % dic['numReplicas'] != 0:
- raise LayerParsingError("Layer '%s': number of replicas (%d) must divide number of replicas in all input layers (input %s has %d replicas)." % (name, dic['numReplicas'], inpl['name'], inpl['numReplicas']))
- if len(set(inp['numReplicas'] for inp in dic['inputLayers'])) != 1:
- raise LayerParsingError("Layer '%s': all input layers must have equal numbers of replicas." % (name))
-
- # Need to also assert that all *next* layers have equal number of replicas but this is hard so it's done in Layer.optimize
- for inp in dic['inputLayers']:
- if inp['outputs'] == 0:
- raise LayerParsingError("Layer '%s': input layer '%s' does not produce any output" % (name, inp['name']))
- dic['numInputs'] = [inp['outputs'] for inp in dic['inputLayers']]
-
- # Layers can declare a neuron activation function to apply to their output, as a shortcut
- # to avoid declaring a separate neuron layer above themselves.
- dic['neuron'] = mcp.safe_get(name, 'neuron', default="")
- if self.num_inputs > 0 and len(dic['numInputs']) != self.num_inputs:
- raise LayerParsingError("Layer '%s': number of inputs must be %d" % (name, self.num_inputs))
-
- if model:
- self.verify_all_ints_in(dic['gpu'], 'gpu', range(len(model.op.get_value('gpu'))))
- return dic
-
- def verify_img_size(self):
- dic = self.dic
- if dic['numInputs'][0] % dic['imgPixels'] != 0 or dic['imgSize'] * dic['imgSize'] != dic['imgPixels']:
- raise LayerParsingError("Layer '%s': has %-d dimensional input, not interpretable as %d-channel images" % (dic['name'], dic['numInputs'][0], dic['channels']))
-
- @staticmethod
- def grad_consumers_below(dic):
- if dic['gradConsumer']:
- return True
- if 'inputLayers' in dic:
- return any(LayerWithInputParser.grad_consumers_below(l) for l in dic['inputLayers'])
-
- def verify_no_grads(self):
- if LayerWithInputParser.grad_consumers_below(self.dic):
- raise LayerParsingError("Layer '%s': layers of type '%s' cannot propagate gradient and must not be placed over layers with parameters." % (self.dic['name'], self.dic['type']))
-
-class NailbedLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def parse(self, name, mcp, prev_layers, model=None):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['forceOwnActs'] = False
- dic['usesActs'] = False
- dic['usesInputs'] = False
-
- dic['channels'] = mcp.safe_get_int(name, 'channels')
- dic['stride'] = mcp.safe_get_int(name, 'stride')
-
- self.verify_num_range(dic['channels'], 'channels', 1, None)
-
- # Computed values
- dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
- dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
- dic['outputsX'] = (dic['imgSize'] + dic['stride'] - 1) / dic['stride']
- dic['start'] = (dic['imgSize'] - dic['stride'] * (dic['outputsX'] - 1)) / 2
- dic['outputs'] = dic['channels'] * dic['outputsX']**2
-
- self.verify_num_range(dic['outputsX'], 'outputsX', 0, None)
-
- self.verify_img_size()
-
- print "Initialized bed-of-nails layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['outputsX'], dic['outputsX'], dic['channels'])
- return dic
-
-class GaussianBlurLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def parse(self, name, mcp, prev_layers, model=None):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['forceOwnActs'] = False
- dic['usesActs'] = False
- dic['usesInputs'] = False
- dic['outputs'] = dic['numInputs'][0]
-
- dic['channels'] = mcp.safe_get_int(name, 'channels')
- dic['filterSize'] = mcp.safe_get_int(name, 'filterSize')
- dic['stdev'] = mcp.safe_get_float(name, 'stdev')
-
- self.verify_num_range(dic['channels'], 'channels', 1, None)
- self.verify_int_in(dic['filterSize'], 'filterSize', [3, 5, 7, 9])
-
- # Computed values
- dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
- dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
- dic['filter'] = n.array([exp(-(dic['filterSize']/2 - i)**2 / float(2 * dic['stdev']**2))
- for i in xrange(dic['filterSize'])], dtype=n.float32).reshape(1, dic['filterSize'])
- dic['filter'] /= dic['filter'].sum()
- self.verify_img_size()
-
- if dic['filterSize'] > dic['imgSize']:
- raise LayerParsingError("Later '%s': filter size (%d) must be smaller than image size (%d)." % (dic['name'], dic['filterSize'], dic['imgSize']))
-
- print "Initialized Gaussian blur layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels'])
-
- return dic
-
-class HorizontalReflectionLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def parse(self, name, mcp, prev_layers, model=None):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['outputs'] = dic['numInputs'][0]
- dic['channels'] = mcp.safe_get_int(name, 'channels')
-
- self.verify_num_range(dic['channels'], 'channels', 1, 3)
-
- # Computed values
- dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
- dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
- self.verify_img_size()
-
- print "Initialized horizontal reflection layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels'])
-
- return dic
-
-class ResizeLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def parse(self, name, mcp, prev_layers, model=None):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['forceOwnActs'] = False
- dic['usesActs'] = False
- dic['usesInputs'] = False
-
- dic['channels'] = mcp.safe_get_int(name, 'channels')
- dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
- dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
-
- dic['scale'] = mcp.safe_get_float(name, 'scale')
- dic['tgtSize'] = int(floor(dic['imgSize'] / dic['scale']))
- dic['tgtPixels'] = dic['tgtSize']**2
- self.verify_num_range(dic['channels'], 'channels', 1, None)
- # Really not recommended to use this for such severe scalings
- self.verify_float_range(dic['scale'], 'scale', 0.5, 2)
-
- dic['outputs'] = dic['channels'] * dic['tgtPixels']
-
- self.verify_img_size()
- self.verify_no_grads()
-
- print "Initialized resize layer '%s', producing %dx%d %d-channel output" % (name, dic['tgtSize'], dic['tgtSize'], dic['channels'])
-
- return dic
-
-class RandomScaleLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def parse(self, name, mcp, prev_layers, model=None):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['forceOwnActs'] = False
- dic['usesActs'] = False
- dic['usesInputs'] = False
-
- dic['channels'] = mcp.safe_get_int(name, 'channels')
- self.verify_num_range(dic['channels'], 'channels', 1, None)
-
- # Computed values
- dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
- dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
-
- dic['maxScale'] = mcp.safe_get_float(name, 'maxScale')
- dic['tgtSize'] = mcp.safe_get_int(name, 'tgtSize')
- min_size = int(floor(dic['imgSize'] / dic['maxScale']))
- max_size = dic['imgSize'] #int(floor(dic['imgSize'] * dic['maxScale']))
- if dic['tgtSize'] < min_size:
- raise LayerParsingError("Layer '%s': target size must be greater than minimum image size after rescaling (%d)" % (name, min_size))
- if dic['tgtSize'] > max_size:
- raise LayerParsingError("Layer '%s': target size must be smaller than maximum image size after rescaling (%d)" % (name, max_size))
- dic['tgtPixels'] = dic['tgtSize']**2
-
- self.verify_float_range(dic['maxScale'], 'maxScale', 1, 2)
-
- dic['outputs'] = dic['channels'] * dic['tgtPixels']
-
- self.verify_img_size()
- self.verify_no_grads()
-
- print "Initialized random scale layer '%s', producing %dx%d %d-channel output" % (name, dic['tgtSize'], dic['tgtSize'], dic['channels'])
-
- return dic
-
-class CropLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def parse(self, name, mcp, prev_layers, model=None):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['forceOwnActs'] = False
- dic['usesActs'] = False
- dic['usesInputs'] = False
-
- dic['channels'] = mcp.safe_get_int(name, 'channels')
- self.verify_num_range(dic['channels'], 'channels', 1, None)
- dic['startX'] = mcp.safe_get_int(name, 'startX')
- dic['startY'] = mcp.safe_get_int(name, 'startY', default=dic['startX'])
- dic['sizeX'] = mcp.safe_get_int(name, 'sizeX')
-
- # Computed values
- dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
- dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
-
- dic['outputs'] = dic['channels'] * (dic['sizeX']**2)
-
- self.verify_num_range(dic['startX'], 'startX', 0, dic['imgSize']-1)
- self.verify_num_range(dic['sizeX'], 'sizeX', 1, dic['imgSize'])
- self.verify_num_range(dic['startY'], 'startY', 0, dic['imgSize']-1)
- self.verify_img_size()
- self.verify_no_grads()
-
- if dic['startX'] + dic['sizeX'] > dic['imgSize']:
- raise LayerParsingError("Layer '%s': startX (%d) + sizeX (%d) > imgSize (%d)" % (name, dic['startX'], dic['sizeX'], dic['imgSize']))
-
- print "Initialized cropping layer '%s', producing %dx%d %d-channel output" % (name, dic['sizeX'], dic['sizeX'], dic['channels'])
-
- return dic
-
-class ColorTransformLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def parse(self, name, mcp, prev_layers, model=None):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['forceOwnActs'] = False
- dic['usesActs'] = False
- dic['usesInputs'] = False
-
- # Computed values
- dic['imgPixels'] = dic['numInputs'][0] / 3
- dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
- dic['channels'] = 3
- dic['outputs'] = dic['numInputs'][0]
-
- self.verify_img_size()
- self.verify_no_grads()
-
- return dic
-
-class RGBToYUVLayerParser(ColorTransformLayerParser):
- def __init__(self):
- ColorTransformLayerParser.__init__(self)
-
- def parse(self, name, mcp, prev_layers, model=None):
- dic = ColorTransformLayerParser.parse(self, name, mcp, prev_layers, model)
- print "Initialized RGB --> YUV layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels'])
- return dic
-
-class RGBToLABLayerParser(ColorTransformLayerParser):
- def __init__(self):
- ColorTransformLayerParser.__init__(self)
-
- def parse(self, name, mcp, prev_layers, model=None):
- dic = ColorTransformLayerParser.parse(self, name, mcp, prev_layers, model)
- dic['center'] = mcp.safe_get_bool(name, 'center', default=False)
- print "Initialized RGB --> LAB layer '%s', producing %dx%d %d-channel output" % (name, dic['imgSize'], dic['imgSize'], dic['channels'])
- return dic
-
-class NeuronLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- @staticmethod
- def get_unused_layer_name(layers, wish):
- if wish not in layers:
- return wish
- for i in xrange(1, 100):
- name = '%s.%d' % (wish, i)
- if name not in layers:
- return name
- raise LayerParsingError("This is insane.")
-
- def parse_neuron(self, neuron_str):
- for n in neuron_parsers:
- p = n.parse(neuron_str)
- if p: # Successfully parsed neuron, return it
- self.dic['neuron'] = p
- self.dic['usesActs'] = self.dic['neuron']['usesActs']
- self.dic['usesInputs'] = self.dic['neuron']['usesInputs']
-
- return
- # Could not parse neuron
- # Print available neuron types
- colnames = ['Neuron type', 'Function']
- m = max(len(colnames[0]), OptionsParser._longest_value(neuron_parsers, key=lambda x:x.type)) + 2
- ntypes = [OptionsParser._bold(colnames[0].ljust(m))] + [n.type.ljust(m) for n in neuron_parsers]
- fnames = [OptionsParser._bold(colnames[1])] + [n.func_str for n in neuron_parsers]
- usage_lines = NL.join(ntype + fname for ntype,fname in zip(ntypes, fnames))
-
- raise LayerParsingError("Layer '%s': unable to parse neuron type '%s'. Valid neuron types: %sWhere neurons have parameters, they must be floats." % (self.dic['name'], neuron_str, NL + usage_lines + NL))
-
- def detach_neuron_layer(self, src_name, layers):
- dic = self.dic
-# self.set_defaults()
- dic['name'] = NeuronLayerParser.get_unused_layer_name(layers, '%s_neuron' % src_name)
- dic['type'] = 'neuron'
- dic['inputs'] = src_name
- dic['neuron'] = layers[src_name]['neuron']
- dic['gpu'] = layers[src_name]['gpu']
-
- # Yes it's not entirely correct to pass all of layers as prev_layers, but it's harmless
- dic = self.parse(dic['name'], FakeConfigParser(dic), layers)
- dic['src_layer'] = src_name
-
- # Link upper layers to this new one
- for l in layers.values():
- if 'inputs' in l:
- l['inputs'] = [inp if inp != src_name else dic['name'] for inp in l['inputs']]
- l['inputLayers'] = [inp if inp['name'] != src_name else dic for inp in l['inputLayers']]
- layers[dic['name']] = dic
-
- def parse(self, name, mcp, prev_layers, model=None):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['outputs'] = dic['numInputs'][0]
- self.parse_neuron(dic['neuron'])
- dic['forceOwnActs'] = False
- print "Initialized neuron layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs'])
- return dic
-
-class EltwiseSumLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self)
-
- def add_params(self, mcp):
- LayerWithInputParser.add_params(self, mcp)
- dic, name = self.dic, self.dic['name']
- dic['coeffs'] = mcp.safe_get_float_list(name, 'coeffs', default=[1.0] * len(dic['inputs']))
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
-
- if len(set(dic['numInputs'])) != 1:
- raise LayerParsingError("Layer '%s': all inputs must have the same dimensionality. Got dimensionalities: %s" % (name, ", ".join(str(s) for s in dic['numInputs'])))
- dic['outputs'] = dic['numInputs'][0]
- dic['usesInputs'] = False
- dic['usesActs'] = False
- dic['forceOwnActs'] = False
- dic['requiresParams'] = True
-
- print "Initialized elementwise sum layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs'])
- return dic
-
-class EltwiseMaxLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- if len(dic['inputs']) < 2:
- raise LayerParsingError("Layer '%s': elementwise max layer must have at least 2 inputs, got %d." % (name, len(dic['inputs'])))
- if len(set(dic['numInputs'])) != 1:
- raise LayerParsingError("Layer '%s': all inputs must have the same dimensionality. Got dimensionalities: %s" % (name, ", ".join(str(s) for s in dic['numInputs'])))
- dic['outputs'] = dic['numInputs'][0]
-
- print "Initialized elementwise max layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs'])
- return dic
-
-class SumLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
-
- dic['stride'] = mcp.safe_get_int(name, 'stride', default=1)
- self.verify_divisible(dic['numInputs'][0], dic['stride'], 'input dimensionality', 'stride')
- dic['outputs'] = dic['numInputs'][0] / dic['stride']
-
- print "Initialized sum layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs'])
- return dic
-
-class DropoutLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def add_params(self, mcp):
- LayerWithInputParser.add_params(self, mcp)
- dic, name = self.dic, self.dic['name']
- dic['enable'] = mcp.safe_get_bool(name, 'enable', default=True)
- dic['keep'] = mcp.safe_get_float(name, 'keep', default=0.5)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['requiresParams'] = True
- dic['usesInputs'] = False
- dic['usesActs'] = False
- dic['forceOwnActs'] = False
- dic['outputs'] = dic['numInputs'][0]
-
- print "Initialized %s layer '%s' on GPUs %s, producing %d outputs" % (dic['type'], name, dic['gpus'], dic['outputs'])
- return dic
-
-class Dropout2LayerParser(DropoutLayerParser):
- def __init__(self):
- DropoutLayerParser.__init__(self)
-
-class WeightLayerParser(LayerWithInputParser):
- LAYER_PAT = re.compile(r'^\s*([^\s\[]+)(?:\[(\d+)\])?\s*$') # matches things like layername[5], etc
-
- def __init__(self, num_inputs=-1):
- LayerWithInputParser.__init__(self, num_inputs=num_inputs)
-
- @staticmethod
- def get_layer_name(name_str):
- m = WeightLayerParser.LAYER_PAT.match(name_str)
- if not m:
- return None
- return m.group(1), m.group(2)
-
- def add_params(self, mcp):
- LayerWithInputParser.add_params(self, mcp)
- dic, name = self.dic, self.dic['name']
- dic['momW'] = mcp.safe_get_float_list(name, 'momW')
- dic['momB'] = mcp.safe_get_float(name, 'momB')
- dic['superEps'] = mcp.safe_get_float(name, 'superEps', default=0.0)
- dic['superMom'] = mcp.safe_get_float(name, 'superMom', default=0.0)
- dic['wc'] = mcp.safe_get_float_list(name, 'wc', default=[0.0] * len(dic['inputs']))
- dic['wball'] = mcp.safe_get_float_list(name, 'wball', default=[0.0] * len(dic['inputs']))
- self.verify_num_params(['momW', 'wc', 'wball'])
-# dic['wballNormed'] = [wball * nweights for wball,nweights in zip(dic['wball'], dic['weightsPerFilter'])]
- dic['wballNormed'] = dic['wball']
-
- # Convert from old-style 0.001,0.02 hyperparam specification to new-stye
- # const[base=0.001],const[base=0.02] and so forth
- def convert_scalars_to_schedules(scalars):
- parts = scalars.split(',')
- for i,p in enumerate(parts):
- p = p.strip()
- if re.match('(?:\d*\.)?\d+$', p):
- parts[i] = 'const[base=%s]' % p
- return parts
-
- dic['epsW'] = self.parse_params(convert_scalars_to_schedules(mcp.safe_get(name, 'epsW')), lrs_parsers, 'epsW', 'learning rate schedule', num_params=len(dic['inputs']))
- dic['epsB'] = self.parse_params(convert_scalars_to_schedules(mcp.safe_get(name, 'epsB')), lrs_parsers, 'epsB', 'learning rate schedule', num_params=1)[0]
-
- dic['updatePeriod'] = mcp.safe_get_int(name, 'updatePeriod', default=0) # 0 means update as often as possible
- # TODO: assert that updatePeriod is a multiple of active pass period, which is unknown here.
- # the assert has to go in some post-processing step..
- dic['gradConsumer'] = dic['epsB']['params']['base'] > 0 or any(w['params']['base'] > 0 for w in dic['epsW'])
-
- @staticmethod
- def unshare_weights(layer, layers, matrix_idx=None):
- def unshare(layer, layers, indices):
- for i in indices:
- if layer['weightSourceLayers'][i] >= 0:
- src_matrix_idx = layer['weightSourceMatrixIndices'][i]
- layer['weightSourceLayers'][i] = ""
- layer['weightSourceMatrixIndices'][i] = -1
- layer['weights'][i] = layer['weights'][i].copy()
- layer['weightsInc'][i] = n.zeros_like(layer['weights'][i])
- print "Unshared weight matrix %s[%d] from %s[%d]." % (layer['name'], i, layer['weightSourceLayers'][i], src_matrix_idx)
- else:
- print "Weight matrix %s[%d] already unshared." % (layer['name'], i)
- if 'weightSourceLayers' in layer:
- unshare(layer, layers, range(len(layer['inputs'])) if matrix_idx is None else [matrix_idx])
-
- # Load weight/biases initialization module
- def call_init_func(self, param_name, shapes, input_idx=-1):
- dic = self.dic
- func_pat = re.compile('^([^\.]+)\.([^\(\)]+)\s*(?:\(([^,]+(?:,[^,]+)*)\))?$')
- m = func_pat.match(dic[param_name])
- if not m:
- raise LayerParsingError("Layer '%s': '%s' parameter must have format 'moduleName.functionName(param1,param2,...)'; got: %s." % (dic['name'], param_name, dic['initWFunc']))
- module, func = m.group(1), m.group(2)
- params = m.group(3).split(',') if m.group(3) is not None else []
- try:
- mod = __import__(module)
- return getattr(mod, func)(dic['name'], input_idx, shapes, params=params) if input_idx >= 0 else getattr(mod, func)(dic['name'], shapes, params=params)
- except (ImportError, AttributeError, TypeError), e:
- raise LayerParsingError("Layer '%s': %s." % (dic['name'], e))
-
- def make_weights(self, initW, rows, cols, order='C'):
- dic = self.dic
- dic['weights'], dic['weightsInc'] = [], []
- if dic['initWFunc']: # Initialize weights from user-supplied python function
- # Initialization function is supplied in the format
- # module.func
- for i in xrange(len(dic['inputs'])):
- dic['weights'] += [self.call_init_func('initWFunc', (rows[i], cols[i]), input_idx=i)]
-
- if type(dic['weights'][i]) != n.ndarray:
- raise LayerParsingError("Layer '%s[%d]': weight initialization function %s must return numpy.ndarray object. Got: %s." % (dic['name'], i, dic['initWFunc'], type(dic['weights'][i])))
- if dic['weights'][i].dtype != n.float32:
- raise LayerParsingError("Layer '%s[%d]': weight initialization function %s must weight matrices consisting of single-precision floats. Got: %s." % (dic['name'], i, dic['initWFunc'], dic['weights'][i].dtype))
- if dic['weights'][i].shape != (rows[i], cols[i]):
- raise LayerParsingError("Layer '%s[%d]': weight matrix returned by weight initialization function %s has wrong shape. Should be: %s; got: %s." % (dic['name'], i, dic['initWFunc'], (rows[i], cols[i]), dic['weights'][i].shape))
- # Convert to desired order
- dic['weights'][i] = n.require(dic['weights'][i], requirements=order)
- dic['weightsInc'] += [n.zeros_like(dic['weights'][i])]
- print "Layer '%s[%d]' initialized weight matrices from function %s" % (dic['name'], i, dic['initWFunc'])
- else:
- for i in xrange(len(dic['inputs'])):
- if dic['weightSourceLayers'][i] != '': # Shared weight matrix
- src_layer = self.prev_layers[dic['weightSourceLayers'][i]] if dic['weightSourceLayers'][i] != dic['name'] else dic
- dic['weights'] += [src_layer['weights'][dic['weightSourceMatrixIndices'][i]]]
- dic['weightsInc'] += [src_layer['weightsInc'][dic['weightSourceMatrixIndices'][i]]]
- if dic['weights'][i].shape != (rows[i], cols[i]):
- raise LayerParsingError("Layer '%s': weight sharing source matrix '%s' has shape %dx%d; should be %dx%d."
- % (dic['name'], dic['weightSource'][i], dic['weights'][i].shape[0], dic['weights'][i].shape[1], rows[i], cols[i]))
- print "Layer '%s' initialized weight matrix %d from %s" % (dic['name'], i, dic['weightSource'][i])
- else:
- dic['weights'] += [n.array(initW[i] * nr.randn(rows[i], cols[i]), dtype=n.single, order=order)]
- dic['weightsInc'] += [n.zeros_like(dic['weights'][i])]
-
- def make_biases(self, rows, cols, order='C'):
- dic = self.dic
- if dic['initBFunc']:
- dic['biases'] = self.call_init_func('initBFunc', (rows, cols))
- if type(dic['biases']) != n.ndarray:
- raise LayerParsingError("Layer '%s': bias initialization function %s must return numpy.ndarray object. Got: %s." % (dic['name'], dic['initBFunc'], type(dic['biases'])))
- if dic['biases'].dtype != n.float32:
- raise LayerParsingError("Layer '%s': bias initialization function %s must return numpy.ndarray object consisting of single-precision floats. Got: %s." % (dic['name'], dic['initBFunc'], dic['biases'].dtype))
- if dic['biases'].shape != (rows, cols):
- raise LayerParsingError("Layer '%s': bias vector returned by bias initialization function %s has wrong shape. Should be: %s; got: %s." % (dic['name'], dic['initBFunc'], (rows, cols), dic['biases'].shape))
-
- dic['biases'] = n.require(dic['biases'], requirements=order)
- print "Layer '%s' initialized bias vector from function %s" % (dic['name'], dic['initBFunc'])
- else:
- dic['biases'] = dic['initB'] * n.ones((rows, cols), order=order, dtype=n.single)
- dic['biasesInc'] = n.zeros_like(dic['biases'])
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['requiresParams'] = True
- dic['gradConsumer'] = True
- dic['usesActs'] = False
- dic['initW'] = mcp.safe_get_float_list(name, 'initW', default=0.01)
- dic['initB'] = mcp.safe_get_float(name, 'initB', default=0)
- dic['initWFunc'] = mcp.safe_get(name, 'initWFunc', default="")
- dic['initBFunc'] = mcp.safe_get(name, 'initBFunc', default="")
- # Find shared weight matrices
-
- dic['weightSource'] = mcp.safe_get_list(name, 'weightSource', default=[''] * len(dic['inputs']))
- self.verify_num_params(['initW'])
- self.verify_num_params(['weightSource'], auto_expand=False)
-
- dic['weightSourceLayers'] = []
- dic['weightSourceMatrixIndices'] = []
-
- for i, src_name in enumerate(dic['weightSource']):
- src_layer_matrix_idx = -1
- src_layer_name = ''
- if src_name != '':
- src_layer_match = WeightLayerParser.get_layer_name(src_name)
- if src_layer_match is None:
- raise LayerParsingError("Layer '%s': unable to parse weight sharing source '%s'. Format is layer[idx] or just layer, in which case idx=0 is used." % (name, src_name))
- src_layer_name = src_layer_match[0]
- src_layer_matrix_idx = int(src_layer_match[1]) if src_layer_match[1] is not None else 0
-
- if src_layer_name not in prev_layers and src_layer_name != name:
- raise LayerParsingError("Layer '%s': weight sharing source layer '%s' does not exist." % (name, src_layer_name))
-
-# src_layer_idx = prev_names.index(src_layer_name) if src_layer_name != name else len(prev_names)
- src_layer = prev_layers[src_layer_name] if src_layer_name != name else dic
- if src_layer['gpu'] != dic['gpu']:
- raise LayerParsingError("Layer '%s': weight sharing source layer '%s' runs on GPUs %s, while '%s' runs on GPUs %s." % (name, src_layer_name, src_layer['gpu'], name, dic['gpu']))
- if src_layer['type'] != dic['type']:
- raise LayerParsingError("Layer '%s': weight sharing source layer '%s' is of type '%s'; should be '%s'." % (name, src_layer_name, src_layer['type'], dic['type']))
- if src_layer_name != name and len(src_layer['weights']) <= src_layer_matrix_idx:
- raise LayerParsingError("Layer '%s': weight sharing source layer '%s' has %d weight matrices, but '%s[%d]' requested." % (name, src_layer_name, len(src_layer['weights']), src_name, src_layer_matrix_idx))
- if src_layer_name == name and src_layer_matrix_idx >= i:
- raise LayerParsingError("Layer '%s': weight sharing source '%s[%d]' not defined yet." % (name, name, src_layer_matrix_idx))
-
- dic['weightSourceLayers'] += [src_layer_name]
- dic['weightSourceMatrixIndices'] += [src_layer_matrix_idx]
-
- return dic
-
-class FCLayerParser(WeightLayerParser):
- def __init__(self):
- WeightLayerParser.__init__(self)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = WeightLayerParser.parse(self, name, mcp, prev_layers, model)
-
- dic['outputs'] = mcp.safe_get_int(name, 'outputs')
- dic['weightsPerFilter'] = dic['numInputs']
- self.verify_num_range(dic['outputs'], 'outputs', 1, None)
- self.make_weights(dic['initW'], dic['numInputs'], [dic['outputs']] * len(dic['numInputs']), order='F')
- self.make_biases(1, dic['outputs'], order='F')
-
- print "Initialized fully-connected layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs'])
- return dic
-
-class SplitFCLayerParser(WeightLayerParser):
- def __init__(self):
- WeightLayerParser.__init__(self)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = WeightLayerParser.parse(self, name, mcp, prev_layers, model)
- dic['parts'] = mcp.safe_get_int(name, 'parts')
- dic['outputs'] = mcp.safe_get_int(name, 'outputs') * dic['parts']
- dic['weightsPerFilter'] = dic['numInputs']
- self.verify_num_range(dic['parts'], 'parts', 1, None)
-
- self.make_weights(dic['initW'], dic['numInputs'], [dic['outputs']/dic['parts']] * len(dic['numInputs']), order='F')
- self.make_biases(1, dic['outputs'], order='F')
-
- for i in xrange(len(dic['numInputs'])):
- self.verify_divisible(dic['numInputs'][i], dic['parts'], 'numInputs', 'parts', input_idx=i)
-
- print "Initialized split fully-connected layer '%s' on GPUs %s, producing %d outputs in %d parts" % (name, dic['gpus'], dic['outputs'], dic['parts'])
- return dic
-
-class LocalLayerParser(WeightLayerParser):
- def __init__(self):
- WeightLayerParser.__init__(self)
-
- # Convert convolutional layer to unshared, locally-connected layer
- @staticmethod
- def conv_to_local(layers, lname):
- layer = layers[lname]
- if layer['type'] == 'conv':
- layer['type'] = 'local'
- for inp,inpname in enumerate(layer['inputs']):
- src_layer_name = layer['weightSourceLayers'][inp]
- if src_layer_name != '':
- src_layer = layers[src_layer_name]
- src_matrix_idx = layer['weightSourceMatrixIndices'][inp]
- LocalLayerParser.conv_to_local(layers, src_layer_name)
- for w in ('weights', 'weightsInc'):
- layer[w][inp] = src_layer[w][src_matrix_idx]
- else:
- layer['weights'][inp] = n.require(n.reshape(n.tile(n.reshape(layer['weights'][inp], (1, n.prod(layer['weights'][inp].shape))), (layer['modules'], 1)),
- (layer['modules'] * layer['filterChannels'][inp] * layer['filterPixels'][inp], layer['filters'])),
- requirements='C')
- layer['weightsInc'][inp] = n.zeros_like(layer['weights'][inp])
- if layer['sharedBiases']:
- layer['biases'] = n.require(n.repeat(layer['biases'], layer['modules'], axis=0), requirements='C')
- layer['biasesInc'] = n.zeros_like(layer['biases'])
-
- print "Converted layer '%s' from convolutional to unshared, locally-connected" % layer['name']
-
- # Also call this function on any layers sharing my weights
- for l in layers:
- if 'weightSourceLayers' in l and lname in l['weightSourceLayers']:
- LocalLayerParser.conv_to_local(layers, l)
- return layer
-
- def parse(self, name, mcp, prev_layers, model):
- dic = WeightLayerParser.parse(self, name, mcp, prev_layers, model)
- dic['requiresParams'] = True
- dic['usesActs'] = False
- # Supplied values
- dic['channels'] = mcp.safe_get_int_list(name, 'channels')
- dic['padding'] = mcp.safe_get_int_list(name, 'padding', default=[0]*len(dic['inputs']))
- dic['stride'] = mcp.safe_get_int_list(name, 'stride', default=[1]*len(dic['inputs']))
- dic['filterSize'] = mcp.safe_get_int_list(name, 'filterSize')
- dic['filters'] = mcp.safe_get_int_list(name, 'filters')
- dic['groups'] = mcp.safe_get_int_list(name, 'groups', default=[1]*len(dic['inputs']))
- dic['initW'] = mcp.safe_get_float_list(name, 'initW')
- dic['initCFunc'] = mcp.safe_get(name, 'initCFunc', default='')
- dic['modulesX'] = mcp.safe_get_int(name, 'modulesX', default=0)
-
-
- self.verify_num_params(['channels', 'padding', 'stride', 'filterSize', \
- 'filters', 'groups', 'initW'])
-
- self.verify_num_range(dic['stride'], 'stride', 1, None)
- self.verify_num_range(dic['filterSize'],'filterSize', 1, None)
- self.verify_num_range(dic['padding'], 'padding', 0, None)
- self.verify_num_range(dic['channels'], 'channels', 1, None)
- self.verify_num_range(dic['groups'], 'groups', 1, None)
- self.verify_num_range(dic['modulesX'], 'modulesX', 0, None)
- for i in xrange(len(dic['filters'])):
- self.verify_divisible(dic['filters'][i], 16, 'filters', input_idx=i)
-
- # Computed values
- dic['imgPixels'] = [numInputs/channels for numInputs,channels in zip(dic['numInputs'], dic['channels'])]
- dic['imgSize'] = [int(n.sqrt(imgPixels)) for imgPixels in dic['imgPixels']]
- self.verify_num_range(dic['imgSize'], 'imgSize', 1, None)
- dic['filters'] = [filters*groups for filters,groups in zip(dic['filters'], dic['groups'])]
- dic['filterPixels'] = [filterSize**2 for filterSize in dic['filterSize']]
- if dic['modulesX'] <= 0:
- dic['modulesX'] = [1 + int(ceil((2*padding + imgSize - filterSize) / float(stride))) for padding,imgSize,filterSize,stride in zip(dic['padding'], dic['imgSize'], dic['filterSize'], dic['stride'])]
- else:
- dic['modulesX'] = [dic['modulesX']] * len(dic['inputs'])
-
- dic['filterChannels'] = [channels/groups for channels,groups in zip(dic['channels'], dic['groups'])]
-
- if len(set(dic['modulesX'])) != 1 or len(set(dic['filters'])) != 1:
- raise LayerParsingError("Layer '%s': all inputs must produce equally-dimensioned output. Dimensions are: %s." % (name, ", ".join("%dx%dx%d" % (filters, modulesX, modulesX) for filters,modulesX in zip(dic['filters'], dic['modulesX']))))
-
- dic['modulesX'] = dic['modulesX'][0]
- dic['modules'] = dic['modulesX']**2
- dic['filters'] = dic['filters'][0]
- dic['outputs'] = dic['modules'] * dic['filters']
-# dic['filterConns'] = [[]] * len(dic['inputs'])
- for i in xrange(len(dic['inputs'])):
- if dic['numInputs'][i] % dic['imgPixels'][i] != 0 or dic['imgSize'][i] * dic['imgSize'][i] != dic['imgPixels'][i]:
- raise LayerParsingError("Layer '%s[%d]': has %-d dimensional input, not interpretable as square %d-channel images" % (name, i, dic['numInputs'][i], dic['channels'][i]))
- if dic['channels'][i] > 3 and dic['channels'][i] % 4 != 0:
- raise LayerParsingError("Layer '%s[%d]': number of channels must be smaller than 4 or divisible by 4" % (name, i))
-# if dic['filterSize'][i] > totalPadding[i] + dic['imgSize'][i]:
-# raise LayerParsingError("Layer '%s[%d]': filter size (%d) greater than image size + padding (%d)" % (name, i, dic['filterSize'][i], dic['padding'][i] + dic['imgSize'][i]))
- if -dic['padding'][i] + dic['stride'][i] * (dic['modulesX'] - 1) + dic['filterSize'][i] < dic['imgSize'][i]:
- raise LayerParsingError("Layer '%s[%d]': %dx%d output map with padding=%d, stride=%d does not cover entire input image." % (name, i, dic['modulesX'], dic['outputsX'], dic['padding'][i], dic['stride'][i]))
-
- if dic['groups'][i] > 1:
- self.verify_divisible(dic['channels'][i], 4*dic['groups'][i], 'channels', '4 * groups', input_idx=i)
- self.verify_divisible(dic['channels'][i], dic['groups'][i], 'channels', 'groups', input_idx=i)
-
- self.verify_divisible(dic['filters'], 16*dic['groups'][i], 'filters * groups', input_idx=i)
-
-
- dic['padding'][i] = -dic['padding'][i]
-# dic['overSample'] = [groups*filterChannels/channels for groups,filterChannels,channels in zip(dic['groups'], dic['filterChannels'], dic['channels'])]
- dic['weightsPerFilter'] = [fc * (fz**2) for fc, fz in zip(dic['filterChannels'], dic['filterSize'])]
-
- return dic
-
-class ConvLayerParser(LocalLayerParser):
- def __init__(self):
- LocalLayerParser.__init__(self)
-
- def add_params(self, mcp):
- LocalLayerParser.add_params(self, mcp)
- self.dic['wcNormMax'] = mcp.safe_get_float_list(self.dic['name'], 'wcNormMax', default=[0.0] * len(self.dic['inputs']))
- self.dic['wcNormMin'] = mcp.safe_get_float_list(self.dic['name'], 'wcNormMin', default=[0.0] * len(self.dic['inputs']))
- self.verify_num_params(['wcNormMax', 'wcNormMin'])
- for min,max in zip(self.dic['wcNormMin'], self.dic['wcNormMax']):
- if min > max:
- raise LayerParsingError("Layer '%s': wcNormMin must be <= wcNormMax." % (self.dic['name']))
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LocalLayerParser.parse(self, name, mcp, prev_layers, model)
-
- dic['sumWidth'] = mcp.safe_get_int(name, 'sumWidth')
- dic['sharedBiases'] = mcp.safe_get_bool(name, 'sharedBiases', default=True)
-
- num_biases = dic['filters'] if dic['sharedBiases'] else dic['modules']*dic['filters']
-
- eltmult = lambda list1, list2: [l1 * l2 for l1,l2 in zip(list1, list2)]
- self.make_weights(dic['initW'], eltmult(dic['filterPixels'], dic['filterChannels']), [dic['filters']] * len(dic['inputs']), order='C')
- self.make_biases(num_biases, 1, order='C')
-
- print "Initialized convolutional layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['modulesX'], dic['modulesX'], dic['filters'])
- return dic
-
-class LocalUnsharedLayerParser(LocalLayerParser):
- def __init__(self):
- LocalLayerParser.__init__(self)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LocalLayerParser.parse(self, name, mcp, prev_layers, model)
- eltmult = lambda list1, list2: [l1 * l2 for l1,l2 in zip(list1, list2)]
- scmult = lambda x, lst: [x * l for l in lst]
- self.make_weights(dic['initW'], scmult(dic['modules'], eltmult(dic['filterPixels'], dic['filterChannels'])), [dic['filters']] * len(dic['inputs']), order='C')
- self.make_biases(dic['modules'] * dic['filters'], 1, order='C')
-
- print "Initialized locally-connected layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (name, dic['gpus'], dic['modulesX'], dic['modulesX'], dic['filters'])
- return dic
-
-class DataLayerParser(LayerParser):
- def __init__(self):
- LayerParser.__init__(self)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerParser.parse(self, name, mcp, prev_layers, model)
- dic['dataIdx'] = mcp.safe_get_int(name, 'dataIdx')
- dic['start'] = mcp.safe_get_int(name, 'start', default=0)
- dic['end'] = mcp.safe_get_int(name, 'end', default=model.train_data_provider.get_data_dims(idx=dic['dataIdx']))
- dic['outputs'] = dic['end'] - dic['start']
-# dic['usesActs'] = False
- print "Initialized data layer '%s', producing %d outputs" % (name, dic['outputs'])
- return dic
-
-class SoftmaxLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['outputs'] = dic['inputLayers'][0]['outputs']
- print "Initialized softmax layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs'])
- return dic
-
-class ConcatentionLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['outputs'] = sum(l['outputs'] for l in dic['inputLayers'])
- dic['copyOffsets'] = [sum(dic['inputLayers'][j]['outputs'] for j in xrange(i)) for i in xrange(len(dic['inputLayers']))]
- print "Initialized concatenation layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs'])
- return dic
-
-class PassThroughLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self)
-
- # Note: this doesn't verify all the necessary constraints. Layer construction may still fail in C++ code.
- # For example, it does not verify that every layer only has one pass-through parent. Obviously having
- # two such parents is incoherent.
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
-# if len(dic['inputLayers']) == 1:
-# raise LayerParsingError("Layer %s: pass-through layer must have more than one input." % dic['name'])
- if len(dic['gpu']) != len(dic['inputLayers'][0]['gpu']):
- raise LayerParsingError("Layer '%s': number of replicas in pass-through layer must be equivalent to number of replicas in input layers." % dic['name'])
- for inp in dic['inputLayers']:
- conflicting_layers = [l for l in prev_layers.values() if l['type'] == 'pass' and inp['name'] in l['inputs'] and len(set(dic['gpu']).intersection(set(l['gpu']))) > 0]
- if len(conflicting_layers) > 0:
- raise LayerParsingError("Layer '%s' conflicts with layer '%s'. Both pass-through layers take layer '%s' as input and operate on an overlapping set of GPUs." % (dic['name'], conflicting_layers[0]['name'], inp['name']))
- dic['outputs'] = sum(l['outputs'] for l in dic['inputLayers'])
-# dic['copyOffsets'] = [sum(dic['inputLayers'][j]['outputs'] for j in xrange(i)) for i in xrange(len(dic['inputLayers']))]
- print "Initialized pass-through layer '%s' on GPUs %s, producing %d outputs" % (name, dic['gpus'], dic['outputs'])
- return dic
-
-class PoolLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def add_params(self, mcp):
- LayerWithInputParser.add_params(self, mcp)
- dic, name = self.dic, self.dic['name']
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['channels'] = mcp.safe_get_int(name, 'channels')
- dic['sizeX'] = mcp.safe_get_int(name, 'sizeX')
- dic['start'] = mcp.safe_get_int(name, 'start', default=0)
- dic['stride'] = mcp.safe_get_int(name, 'stride')
- dic['outputsX'] = mcp.safe_get_int(name, 'outputsX', default=0)
- dic['pool'] = mcp.safe_get(name, 'pool')
-
- # Avg pooler does not use its acts or inputs
- dic['usesActs'] = dic['pool'] != 'avg'
- dic['usesInputs'] = dic['pool'] != 'avg'
-
- dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
- dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
-
- if dic['pool'] == 'avg':
- dic['sum'] = mcp.safe_get_bool(name, 'sum', default=False)
-
- self.verify_num_range(dic['sizeX'], 'sizeX', 1, dic['imgSize'])
- self.verify_num_range(dic['stride'], 'stride', 1, dic['sizeX'])
- self.verify_num_range(dic['outputsX'], 'outputsX', 0, None)
- self.verify_num_range(dic['channels'], 'channels', 1, None)
-
- if LayerWithInputParser.grad_consumers_below(dic):
- self.verify_divisible(dic['channels'], 16, 'channels')
- self.verify_str_in(dic['pool'], 'pool', ['max', 'maxabs', 'avg'])
-
- self.verify_img_size()
-
- if dic['outputsX'] <= 0:
- dic['outputsX'] = int(ceil((dic['imgSize'] - dic['start'] - dic['sizeX']) / float(dic['stride']))) + 1;
- dic['outputs'] = dic['outputsX']**2 * dic['channels']
-
- print "Initialized %s-pooling layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (dic['pool'], name, dic['gpus'], dic['outputsX'], dic['outputsX'], dic['channels'])
- return dic
-
-
-class CrossMapPoolLayerParser(LayerWithInputParser):
- def __init__(self):
- LayerWithInputParser.__init__(self, num_inputs=1)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['channels'] = mcp.safe_get_int(name, 'channels')
- dic['size'] = mcp.safe_get_int(name, 'size')
- dic['start'] = mcp.safe_get_int(name, 'start', default=0)
- dic['stride'] = mcp.safe_get_int(name, 'stride')
- dic['outputChannels'] = mcp.safe_get_int(name, 'outputs', default=0)
- dic['pool'] = mcp.safe_get(name, 'pool')
- dic['requiresParams'] = False
-
- # Avg pooler does not use its acts or inputs
- dic['usesActs'] = 'pool' != 'avg'
- dic['usesInputs'] = 'pool' != 'avg'
-
- dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
- dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
- dic['outputs'] = dic['outputChannels'] * dic['imgPixels']
-
- self.verify_num_range(dic['size'], 'size', 1, dic['channels'])
- self.verify_num_range(dic['stride'], 'stride', 1, dic['size'])
- self.verify_num_range(dic['outputChannels'], 'outputChannels', 0, None)
- self.verify_num_range(dic['channels'], 'channels', 1, None)
- self.verify_num_range(dic['start'], 'start', None, 0)
-
- self.verify_str_in(dic['pool'], 'pool', ['max'])
- self.verify_img_size()
-
- covered_chans = dic['start'] + (dic['outputChannels'] - 1) * dic['stride'] + dic['size']
- if covered_chans < dic['channels']:
- raise LayerParsingError("Layer '%s': cross-map pooling with start=%d, stride=%d, size=%d, outputs=%d covers only %d of %d input channels." % \
- (name, dic['start'], dic['stride'], dic['size'], dic['outputChannels'], covered_chans, dic['channels']))
-
- print "Initialized cross-map %s-pooling layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (dic['pool'], name, dic['gpus'], dic['imgSize'], dic['imgSize'], dic['outputChannels'])
- return dic
-
-class NormLayerParser(LayerWithInputParser):
- RESPONSE_NORM = 'response'
- CONTRAST_NORM = 'contrast'
- CROSSMAP_RESPONSE_NORM = 'cross-map response'
-
- def __init__(self, norm_type):
- LayerWithInputParser.__init__(self, num_inputs=1)
- self.norm_type = norm_type
-
- def add_params(self, mcp):
- LayerWithInputParser.add_params(self, mcp)
- dic, name = self.dic, self.dic['name']
- dic['scale'] = mcp.safe_get_float(name, 'scale')
- dic['scale'] /= dic['size'] if self.norm_type == self.CROSSMAP_RESPONSE_NORM else dic['size']**2
- dic['pow'] = mcp.safe_get_float(name, 'pow')
- dic['minDiv'] = mcp.safe_get_float(name, 'minDiv', default=1.0)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['requiresParams'] = True
- dic['channels'] = mcp.safe_get_int(name, 'channels')
- dic['size'] = mcp.safe_get_int(name, 'size')
- dic['blocked'] = mcp.safe_get_bool(name, 'blocked', default=False)
-
- dic['imgPixels'] = dic['numInputs'][0] / dic['channels']
- dic['imgSize'] = int(n.sqrt(dic['imgPixels']))
-
- # Contrast normalization layer does not use its inputs
- dic['usesInputs'] = self.norm_type != self.CONTRAST_NORM
-
- self.verify_num_range(dic['channels'], 'channels', 1, None)
- if self.norm_type == self.CROSSMAP_RESPONSE_NORM:
- self.verify_num_range(dic['size'], 'size', 2, dic['channels'])
- if dic['channels'] % 16 != 0:
- raise LayerParsingError("Layer '%s': number of channels must be divisible by 16 when using crossMap" % name)
- else:
- self.verify_num_range(dic['size'], 'size', 1, dic['imgSize'])
-
- if self.norm_type != self.CROSSMAP_RESPONSE_NORM and dic['channels'] > 3 and dic['channels'] % 4 != 0:
- raise LayerParsingError("Layer '%s': number of channels must be smaller than 4 or divisible by 4" % name)
-
- self.verify_img_size()
-
- dic['outputs'] = dic['imgPixels'] * dic['channels']
- print "Initialized %s-normalization layer '%s' on GPUs %s, producing %dx%d %d-channel output" % (self.norm_type, name, dic['gpus'], dic['imgSize'], dic['imgSize'], dic['channels'])
- return dic
-
-class CostParser(LayerWithInputParser):
- def __init__(self, num_inputs=-1):
- LayerWithInputParser.__init__(self, num_inputs=num_inputs)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = LayerWithInputParser.parse(self, name, mcp, prev_layers, model)
- dic['requiresParams'] = True
- # Stored as string because python can't pickle lambda functions
- dic['outputFilter'] = 'lambda costs,num_cases: [c/num_cases for c in costs]'
- dic['children'] = mcp.safe_get_list(name, 'children', default=[])
- # Aggregated costs only produce outputs which are additive.
- for c in dic['children']:
- if c not in prev_layers:
- raise LayerParsingError("Layer '%s': child cost layer '%s' not defined" % (name, c))
- if prev_layers[c]['type'] != dic['type']:
- raise LayerParsingError("Layer '%s': child cost layer '%s' must have same type as parent" % (name, c))
- prev_layers[c]['aggregated'] = 1
- dic['aggregated'] = dic['children'] != []
- del dic['neuron']
- return dic
-
- def add_params(self, mcp):
- LayerWithInputParser.add_params(self, mcp)
- dic, name = self.dic, self.dic['name']
- dic['coeff'] = mcp.safe_get_float(name, 'coeff')
- dic['gradConsumer'] = dic['coeff'] > 0
-
-class CrossEntCostParser(CostParser):
- def __init__(self):
- CostParser.__init__(self, num_inputs=2)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = CostParser.parse(self, name, mcp, prev_layers, model)
- if dic['numInputs'][0] != model.train_data_provider.get_num_classes(): # first input must be labels
- raise LayerParsingError("Layer '%s': Dimensionality of first input must be equal to number of labels" % name)
- if dic['inputLayers'][1]['type'] != 'softmax':
- raise LayerParsingError("Layer '%s': Second input must be softmax layer" % name)
- if dic['numInputs'][1] != model.train_data_provider.get_num_classes():
- raise LayerParsingError("Layer '%s': Softmax input '%s' must produce %d outputs, because that is the number of classes in the dataset" \
- % (name, dic['inputs'][1], model.train_data_provider.get_num_classes()))
-
- print "Initialized cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus'])
- return dic
-
-class LogregCostParser(CostParser):
- def __init__(self):
- CostParser.__init__(self, num_inputs=2)
-
- def add_params(self, mcp):
- CostParser.add_params(self, mcp)
- dic, name = self.dic, self.dic['name']
- dic['topk'] = mcp.safe_get_int(name, 'topk', default=1)
- if dic['topk'] > dic['numInputs'][1]:
- raise LayerParsingError("Layer '%s': parameter 'topk'must not have value greater than the number of classess." % (name))
-
- def parse(self, name, mcp, prev_layers, model):
- dic = CostParser.parse(self, name, mcp, prev_layers, model)
- dic['requiresParams'] = True
- if dic['numInputs'][0] != 1: # first input must be labels
- raise LayerParsingError("Layer '%s': dimensionality of first input must be 1" % name)
- if dic['inputLayers'][1]['type'] != 'softmax':
- raise LayerParsingError("Layer '%s': second input must be softmax layer" % name)
- if dic['numInputs'][1] != model.train_data_provider.get_num_classes():
- raise LayerParsingError("Layer '%s': softmax input '%s' must produce %d outputs, because that is the number of classes in the dataset" \
- % (name, dic['inputs'][1], model.train_data_provider.get_num_classes()))
-
- print "Initialized logistic regression cost '%s' on GPUs %s" % (name, dic['gpus'])
- return dic
-
-class BinomialCrossEntCostParser(CostParser):
- def __init__(self):
- CostParser.__init__(self, num_inputs=2)
-
- def add_params(self, mcp):
- CostParser.add_params(self, mcp)
- self.dic['posWeight'] = mcp.safe_get_float(self.dic['name'], 'posWeight', default=1.0)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = CostParser.parse(self, name, mcp, prev_layers, model)
-
- if dic['numInputs'][0] != dic['numInputs'][1]:
- raise LayerParsingError("Layer '%s': both inputs must produce the same number of outputs" % (name))
-
- if 'neuron' not in dic['inputLayers'][1] or dic['inputLayers'][1]['neuron'] != 'logistic':
- print "WARNING: Layer '%s': input '%s' is not logistic, results may not be what you intend." % (dic['name'], dic['inputs'][1])
-
- if dic['type'] == 'cost.bce':
- print "Initialized binomial cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus'])
-
-
- dic['computeSoftmaxErrorRate'] = True
- return dic
-
-class DetectionCrossEntCostParser(BinomialCrossEntCostParser):
- def __init__(self):
- BinomialCrossEntCostParser.__init__(self)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = BinomialCrossEntCostParser.parse(self, name, mcp, prev_layers, model)
- if dic['numInputs'][0] != model.train_data_provider.get_num_classes(): # first input must be labels
- raise LayerParsingError("Layer '%s': Dimensionality of first input must be equal to number of labels" % name)
- dic['computeSoftmaxErrorRate'] = False
- dic['outputFilter'] = 'lambda costs,num_cases: [c/num_cases for c in costs[:2]] + [(class_cost[2] / class_cost[j] if class_cost[j] > 0 else n.inf) for class_cost in [costs[2:][i*3:(i+1)*3] for i in range(len(costs[2:])/3)] for j in range(2)]'
- dic['outputFilterFormatter'] = 'lambda self,costs: "(crossent) %.6f, (err) %.6f, " % (costs[0], costs[1]) + ", ".join("(%s) %.6f, %.6f" % (self.train_data_provider.batch_meta["label_names"][i/2-1],costs[i],costs[i+1]) for i in xrange(2, len(costs), 2))'
- print "Initialized detection cross-entropy cost '%s' on GPUs %s" % (name, dic['gpus'])
- return dic
-
-class SumOfSquaresCostParser(CostParser):
- def __init__(self):
- CostParser.__init__(self, num_inputs=1)
-
- def parse(self, name, mcp, prev_layers, model):
- dic = CostParser.parse(self, name, mcp, prev_layers, model)
- print "Initialized sum-of-squares cost '%s' on GPUs %s" % (name, dic['gpus'])
- return dic
-
-# All the layer parsers
-layer_parsers = {'data' : lambda : DataLayerParser(),
- 'fc': lambda : FCLayerParser(),
- 'sfc': lambda : SplitFCLayerParser(),
- 'conv': lambda : ConvLayerParser(),
- 'local': lambda : LocalUnsharedLayerParser(),
- 'softmax': lambda : SoftmaxLayerParser(),
- 'eltsum': lambda : EltwiseSumLayerParser(),
- 'eltmax': lambda : EltwiseMaxLayerParser(),
- 'sum': lambda : SumLayerParser(),
- 'neuron': lambda : NeuronLayerParser(),
- 'pool': lambda : PoolLayerParser(),
- 'cmpool': lambda : CrossMapPoolLayerParser(),
- 'rnorm': lambda : NormLayerParser(NormLayerParser.RESPONSE_NORM),
- 'cnorm': lambda : NormLayerParser(NormLayerParser.CONTRAST_NORM),
- 'cmrnorm': lambda : NormLayerParser(NormLayerParser.CROSSMAP_RESPONSE_NORM),
- 'nailbed': lambda : NailbedLayerParser(),
- 'blur': lambda : GaussianBlurLayerParser(),
- 'href': lambda : HorizontalReflectionLayerParser(),
- 'resize': lambda : ResizeLayerParser(),
- 'rgb2yuv': lambda : RGBToYUVLayerParser(),
- 'rgb2lab': lambda : RGBToLABLayerParser(),
- 'rscale': lambda : RandomScaleLayerParser(),
- 'crop': lambda : CropLayerParser(),
- 'concat': lambda : ConcatentionLayerParser(),
- 'pass': lambda : PassThroughLayerParser(),
- 'dropout': lambda : DropoutLayerParser(),
- 'dropout2': lambda : Dropout2LayerParser(),
- 'cost.logreg': lambda : LogregCostParser(),
- 'cost.crossent': lambda : CrossEntCostParser(),
- 'cost.bce': lambda : BinomialCrossEntCostParser(),
- 'cost.dce': lambda : DetectionCrossEntCostParser(),
- 'cost.sum2': lambda : SumOfSquaresCostParser()}
-
-# All the neuron parsers
-# This isn't a name --> parser mapping as the layer parsers above because neurons don't have fixed names.
-# A user may write tanh[0.5,0.25], etc.
-neuron_parsers = sorted([NeuronParser('ident', 'f(x) = x', uses_acts=False, uses_inputs=False),
- NeuronParser('logistic', 'f(x) = 1 / (1 + e^-x)', uses_acts=True, uses_inputs=False),
- NeuronParser('abs', 'f(x) = |x|', uses_acts=False, uses_inputs=True),
- NeuronParser('relu', 'f(x) = max(0, x)', uses_acts=True, uses_inputs=False),
- NeuronParser('nrelu', 'f(x) = max(0, x) + noise', uses_acts=True, uses_inputs=False),
- NeuronParser('softrelu', 'f(x) = log(1 + e^x)', uses_acts=True, uses_inputs=False),
- NeuronParser('square', 'f(x) = x^2', uses_acts=False, uses_inputs=True),
- NeuronParser('sqrt', 'f(x) = sqrt(x)', uses_acts=True, uses_inputs=False),
- ParamNeuronParser('log[a]', 'f(x) = log(a + x)', uses_acts=False, uses_inputs=True),
- ParamNeuronParser('tanh[a,b]', 'f(x) = a * tanh(b * x)', uses_acts=True, uses_inputs=False),
- ParamNeuronParser('brelu[a]', 'f(x) = min(a, max(0, x))', uses_acts=True, uses_inputs=False),
- ParamNeuronParser('linear[a,b]', 'f(x) = a * x + b', uses_acts=True, uses_inputs=False),
- ParamNeuronParser('drelu[a]', 'f(x) = x - a * tanh(x / a)', uses_acts=False, uses_inputs=True)],
- key=lambda x:x.type)
-
-# Learning rate schedules
-lrs_parsers = sorted([ParamParser('const[fbase]'),
- ParamParser('linear[fbase;ftgtFactor]'),
- ParamParser('exp[fbase;ftgtFactor]'),
- ParamParser('dexp[fbase;ftgtFactor;inumSteps]')])
+++ /dev/null
-# 11% error on CIFAR-10 - layer parameter file
-# Methodology:
-# 1. Train on batches 1-4, use batch 5 for validation.
-# 2. After about 350 epochs, validation error no longer making improvements.
-# 3. Fold in batch 5.
-# 4. Train on batches 1-5 for about 150 more epochs, until the batch 5 error is near the errors for batches 1-4. It takes forever to actually get there but after 150 epochs it's close enough.
-# 5. Lower learning rates (epsW) by a factor of 10 to 0.0001, train for 10 more epochs.
-# 6. Lower learning rates (epsW) by another factor of 10 to 0.00001, train for 10 more epochs.
-# 7. Stop. Test on batch 6 with --test-range=6 --multiview-test=1 --logreg-name=logprob (read more about what this does here: http://code.google.com/p/cuda-convnet/wiki/TrainingNet#Training_on_image_translations )
-
-# More details about methodology: http://code.google.com/p/cuda-convnet/wiki/Methodology
-
-[conv1]
-epsW=0.001
-epsB=0.002
-momW=0.9
-momB=0.9
-wc=0.000
-
-[conv2]
-epsW=0.001
-epsB=0.002
-momW=0.9
-momB=0.9
-wc=0.000
-
-[local3]
-epsW=0.001
-epsB=0.002
-momW=0.9
-momB=0.9
-wc=0.004
-
-[local4]
-epsW=0.001
-epsB=0.002
-momW=0.9
-momB=0.9
-wc=0.004
-
-[fc10]
-epsW=0.001
-epsB=0.002
-momW=0.9
-momB=0.9
-wc=0.01
-
-[logprob]
-coeff=1
-
-[rnorm1]
-scale=0.001
-pow=0.75
-
-[rnorm2]
-scale=0.001
-pow=0.75
+++ /dev/null
-[conv1]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv3]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv4]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv5]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc4096a]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc4096b]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc1000]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[logprob]
-coeff=1
-topk=5
-
-[dropout1]
-enable=true
-
-[dropout2]
-enable=true
-
-[rnorm1]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-[rnorm2]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-
+++ /dev/null
-[conv1]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv3]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv4]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv5]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc4096a]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc4096b]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc1000]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[logprob]
-coeff=1
-topk=5
-
-[dropout1]
-enable=true
-
-[dropout2]
-enable=true
-
-[rnorm1]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-[rnorm2]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-
+++ /dev/null
-[conv1a]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-
-
-[conv1b]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-
-
-[conv2a]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-
-
-[conv2b]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-
-
-[conv3a]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9,0.9
-momB=0.9
-wc=0.0005,0.0005
-wball=0,0
-
-
-
-[conv3b]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9,0.9
-momB=0.9
-wc=0.0005,0.0005
-wball=0,0
-
-
-[conv4a]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-
-
-[conv4b]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-
-
-[conv5a]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-
-
-[conv5b]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-
-
-[fc2048a]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9,0.9
-momB=0.9
-wc=0.0005,0.0005
-wball=0,0
-
-
-[fc2048b]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9,0.9
-momB=0.9
-wc=0.0005,0.0005
-wball=0,0
-
-
-[fc2048ba]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9,0.9
-momB=0.9
-wc=0.0005,0.0005
-wball=0,0
-
-
-[fc2048bb]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9,0.9
-momB=0.9
-wc=0.0005,0.0005
-wball=0,0
-
-[fc1000]
-epsW=dexp[base=0.02;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.04;tgtFactor=25;numSteps=2]
-momW=0.9,0.9
-momB=0.9
-wc=0.0005,0.0005
-wball=0,0
-
-
-[logprob]
-coeff=1
-topk=5
-
-[dropout1a]
-enable=true
-keep=0.5
-
-[dropout2a]
-enable=true
-keep=0.5
-
-[dropout1b]
-enable=true
-keep=0.5
-
-[dropout2b]
-enable=true
-keep=0.5
-
-[rnorm1a]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-[rnorm1b]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-[rnorm2a]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-[rnorm2b]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-[cnorm2a]
-scale=0.001
-pow=0.75
-
-[cnorm2b]
-scale=0.001
-pow=0.75
+++ /dev/null
-[conv1]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-
-[conv2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-
-[conv3]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-
-[conv4]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-
-[conv5]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-
-[fc1024a]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc1024b]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc1024c]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc1024d]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc1024ba]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc1024bb]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc1024bc]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc1024bd]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-
-[fc1000]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.01;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.02;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[logprob]
-coeff=1
-topk=5
-
-[dropout1a]
-enable=true
-keep=0.5
-
-[dropout1b]
-enable=true
-keep=0.5
-
-[dropout1c]
-enable=true
-keep=0.5
-
-[dropout1d]
-enable=true
-keep=0.5
-
-[dropout2a]
-enable=true
-keep=0.5
-
-[dropout2b]
-enable=true
-keep=0.5
-
-[dropout2c]
-enable=true
-keep=0.5
-
-[dropout2d]
-enable=true
-keep=0.5
-
-[rnorm1]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-[rnorm2]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-
+++ /dev/null
-[conv1]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv2]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0.00
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv3]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv4]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[conv5]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc4096a]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc4096b]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[fc1000]
-momW=0.9
-momB=0.9
-wc=0.0005
-wball=0
-epsW=dexp[base=0.04;tgtFactor=250;numSteps=4]
-epsB=dexp[base=0.08;tgtFactor=10;numSteps=2]
-updatePeriod=1
-
-[logprob]
-coeff=1
-topk=5
-
-[dropout1]
-enable=true
-
-[dropout2]
-enable=true
-
-[rnorm1]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-[rnorm2]
-scale=0.0001
-pow=0.75
-minDiv=2
-
-
+++ /dev/null
-[data]
-type=data
-dataIdx=0
-
-[labels]
-type=data
-dataIdx=1
-
-[conv1]
-type=conv
-inputs=data
-channels=3
-filters=64
-padding=2
-stride=1
-filterSize=5
-neuron=relu
-initW=0.0001
-sumWidth=4
-sharedBiases=1
-gpu=0
-
-[pool1]
-type=pool
-pool=max
-inputs=conv1
-start=0
-sizeX=3
-stride=2
-outputsX=0
-channels=64
-
-[rnorm1]
-type=cmrnorm
-inputs=pool1
-channels=64
-size=9
-
-[conv2]
-type=conv
-inputs=rnorm1
-filters=64
-padding=2
-stride=1
-filterSize=5
-channels=64
-neuron=relu
-initW=0.01
-sumWidth=2
-sharedBiases=1
-
-[rnorm2]
-type=cmrnorm
-inputs=conv2
-channels=64
-size=9
-
-[pool2]
-type=pool
-pool=max
-inputs=rnorm2
-start=0
-sizeX=3
-stride=2
-outputsX=0
-channels=64
-
-[local3]
-type=local
-inputs=pool2
-filters=64
-padding=1
-stride=1
-filterSize=3
-channels=64
-neuron=relu
-initW=0.04
-
-[local4]
-type=local
-inputs=local3
-filters=32
-padding=1
-stride=1
-filterSize=3
-channels=64
-neuron=relu
-initW=0.04
-
-[fc10]
-type=fc
-outputs=10
-inputs=local4
-initW=0.01
-
-[probs]
-type=softmax
-inputs=fc10
-
-[logprob]
-type=cost.logreg
-inputs=labels,probs
-gpu=0
+++ /dev/null
-[data]
-type=data
-dataIdx=0
-
-[labvec]
-type=data
-dataIdx=1
-
-[conv1]
-type=conv
-inputs=data
-channels=3
-filters=64
-padding=0
-stride=4
-filterSize=11
-initW=0.01
-sumWidth=4
-sharedBiases=1
-gpu=0
-
-[rnorm1]
-type=cmrnorm
-inputs=conv1
-channels=64
-size=5
-
-[pool1]
-type=pool
-pool=max
-inputs=rnorm1
-sizeX=3
-stride=2
-channels=64
-neuron=relu
-
-[conv2]
-type=conv
-inputs=pool1
-filters=192
-padding=2
-stride=1
-filterSize=5
-channels=64
-initW=0.01
-initB=1
-sumWidth=3
-sharedBiases=1
-neuron=relu
-
-[rnorm2]
-type=cmrnorm
-inputs=conv2
-channels=192
-size=5
-
-[pool2]
-type=pool
-pool=max
-inputs=rnorm2
-sizeX=3
-stride=2
-channels=192
-
-[conv3]
-type=conv
-inputs=pool2
-filters=384
-padding=1
-stride=1
-filterSize=3
-channels=192
-initW=0.03
-sumWidth=3
-sharedBiases=1
-neuron=relu
-
-[conv4]
-type=conv
-inputs=conv3
-filters=256
-padding=1
-stride=1
-filterSize=3
-channels=384
-neuron=relu
-initW=0.03
-initB=1
-sumWidth=3
-sharedBiases=1
-
-[conv5]
-type=conv
-inputs=conv4
-filters=256
-padding=1
-stride=1
-filterSize=3
-channels=256
-initW=0.03
-initB=1
-sumWidth=3
-
-[pool3]
-type=pool
-pool=max
-inputs=conv5
-sizeX=3
-stride=2
-channels=256
-neuron=relu
-
-[fc4096a]
-type=fc
-inputs=pool3
-outputs=4096
-initW=0.01
-initB=1
-neuron=relu
-gpu=0
-
-[dropout1]
-type=dropout2
-inputs=fc4096a
-
-[fc4096b]
-type=fc
-inputs=dropout1
-outputs=4096
-initW=0.01
-initB=1
-neuron=relu
-gpu=0
-
-[dropout2]
-type=dropout2
-inputs=fc4096b
-
-[fc1000]
-type=fc
-outputs=1000
-inputs=dropout2
-initW=0.01
-initB=-7
-gpu=0
-
-[probs]
-type=softmax
-inputs=fc1000
-
-[logprob]
-type=cost.logreg
-inputs=labvec,probs
-gpu=0
-
+++ /dev/null
-[data]
-type=data
-dataIdx=0
-
-[labvec]
-type=data
-dataIdx=1
-
-[conv1]
-type=conv
-inputs=data
-channels=3
-filters=64
-padding=0
-stride=4
-filterSize=11
-initW=0.01
-sumWidth=4
-sharedBiases=1
-gpu=0,1
-
-[rnorm1]
-type=cmrnorm
-inputs=conv1
-channels=64
-size=5
-
-[pool1]
-type=pool
-pool=max
-inputs=rnorm1
-sizeX=3
-stride=2
-channels=64
-neuron=relu
-
-[conv2]
-type=conv
-inputs=pool1
-filters=192
-padding=2
-stride=1
-filterSize=5
-channels=64
-initW=0.01
-initB=1
-sumWidth=3
-sharedBiases=1
-neuron=relu
-
-[rnorm2]
-type=cmrnorm
-inputs=conv2
-channels=192
-size=5
-
-[pool2]
-type=pool
-pool=max
-inputs=rnorm2
-sizeX=3
-stride=2
-channels=192
-
-[conv3]
-type=conv
-inputs=pool2
-filters=384
-padding=1
-stride=1
-filterSize=3
-channels=192
-initW=0.03
-sumWidth=3
-sharedBiases=1
-neuron=relu
-
-[conv4]
-type=conv
-inputs=conv3
-filters=256
-padding=1
-stride=1
-filterSize=3
-channels=384
-neuron=relu
-initW=0.03
-initB=1
-sumWidth=3
-sharedBiases=1
-
-[conv5]
-type=conv
-inputs=conv4
-filters=256
-padding=1
-stride=1
-filterSize=3
-channels=256
-initW=0.03
-initB=1
-sumWidth=3
-
-[pool3]
-type=pool
-pool=max
-inputs=conv5
-sizeX=3
-stride=2
-channels=256
-neuron=relu
-
-[fc4096a]
-type=fc
-inputs=pool3
-outputs=4096
-initW=0.01
-initB=1
-neuron=relu
-
-[dropout1]
-type=dropout2
-inputs=fc4096a
-
-[fc4096b]
-type=fc
-inputs=dropout1
-outputs=4096
-initW=0.01
-initB=1
-neuron=relu
-
-[dropout2]
-type=dropout2
-inputs=fc4096b
-
-[fc1000]
-type=fc
-outputs=1000
-inputs=dropout2
-initW=0.01
-initB=-7
-
-[probs]
-type=softmax
-inputs=fc1000
-
-[logprob]
-type=cost.logreg
-inputs=labvec,probs
-gpu=0,1
-
+++ /dev/null
-[data]
-type=data
-dataIdx=0
-
-[labels]
-type=data
-dataIdx=1
-
-[conv1a]
-type=conv
-inputs=data
-channels=3
-filters=48
-padding=0
-stride=4
-filterSize=11
-initW=0.01
-sumWidth=3
-sharedBiases=1
-gpu=0
-
-[conv1b]
-type=conv
-inputs=data
-channels=3
-filters=48
-padding=0
-stride=4
-filterSize=11
-initW=0.01
-sumWidth=3
-sharedBiases=1
-gpu=1
-
-[rnorm1a]
-type=cmrnorm
-inputs=conv1a
-channels=48
-size=5
-
-[rnorm1b]
-type=cmrnorm
-inputs=conv1b
-channels=48
-size=5
-
-[pool1a]
-type=pool
-pool=max
-inputs=rnorm1a
-sizeX=3
-stride=2
-channels=48
-neuron=relu
-
-[pool1b]
-type=pool
-pool=max
-inputs=rnorm1b
-sizeX=3
-stride=2
-channels=48
-neuron=relu
-
-[conv2a]
-type=conv
-inputs=pool1a
-filters=128
-padding=2
-stride=1
-filterSize=5
-channels=48
-initW=0.01
-initB=1
-sumWidth=3
-sharedBiases=1
-neuron=relu
-gpu=0
-
-[conv2b]
-type=conv
-inputs=pool1b
-filters=128
-padding=2
-stride=1
-filterSize=5
-channels=48
-initW=0.01
-initB=1
-sumWidth=3
-sharedBiases=1
-neuron=relu
-gpu=1
-
-[rnorm2a]
-type=cmrnorm
-inputs=conv2a
-channels=128
-size=5
-
-[rnorm2b]
-type=cmrnorm
-inputs=conv2b
-channels=128
-size=5
-
-[cnorm2a]
-type=rnorm
-inputs=rnorm2a
-channels=128
-size=5
-
-[cnorm2b]
-type=rnorm
-inputs=rnorm2b
-channels=128
-size=5
-
-[pool2a]
-type=pool
-pool=max
-inputs=cnorm2a
-sizeX=3
-stride=2
-channels=128
-
-[pool2b]
-type=pool
-pool=max
-inputs=cnorm2b
-sizeX=3
-stride=2
-channels=128
-
-[conv3a]
-type=conv
-inputs=pool2a,pool2b
-filters=192,192
-padding=1,1
-stride=1,1
-filterSize=3,3
-channels=128,128
-initW=0.03,0.03
-sumWidth=2
-sharedBiases=1
-neuron=relu
-gpu=0
-
-[conv3b]
-type=conv
-inputs=pool2a,pool2b
-filters=192,192
-padding=1,1
-stride=1,1
-filterSize=3,3
-channels=128,128
-initW=0.03,0.03
-sumWidth=2
-sharedBiases=1
-neuron=relu
-gpu=1
-
-[conv4a]
-type=conv
-inputs=conv3a
-filters=192
-padding=1
-stride=1
-filterSize=3
-channels=192
-neuron=relu
-initW=0.03
-initB=1
-sumWidth=2
-sharedBiases=1
-
-[conv4b]
-type=conv
-inputs=conv3b
-filters=192
-padding=1
-stride=1
-filterSize=3
-channels=192
-neuron=relu
-initW=0.03
-initB=1
-sumWidth=2
-sharedBiases=1
-
-
-[conv5a]
-type=conv
-inputs=conv4a
-filters=128
-padding=1
-stride=1
-filterSize=3
-channels=192
-initW=0.03
-initB=1
-sumWidth=2
-groups=1
-randSparse=0
-
-[conv5b]
-type=conv
-inputs=conv4b
-filters=128
-padding=1
-stride=1
-filterSize=3
-channels=192
-initW=0.03
-initB=1
-sumWidth=2
-groups=1
-randSparse=0
-
-[pool3a]
-type=pool
-pool=max
-inputs=conv5a
-sizeX=3
-stride=2
-channels=128
-neuron=relu
-
-[pool3b]
-type=pool
-pool=max
-inputs=conv5b
-sizeX=3
-stride=2
-channels=128
-neuron=relu
-
-[fc2048a]
-type=fc
-inputs=pool3a,pool3b
-outputs=2048
-initW=0.01,0.01
-initB=1
-neuron=relu
-gpu=0
-
-[fc2048b]
-type=fc
-inputs=pool3a,pool3b
-outputs=2048
-initW=0.01,0.01
-initB=1
-neuron=relu
-gpu=1
-
-[dropout1a]
-type=dropout
-inputs=fc2048a
-
-[dropout1b]
-type=dropout
-inputs=fc2048b
-
-[fc2048ba]
-type=fc
-inputs=dropout1a,dropout1b
-outputs=2048
-initW=0.01,0.01
-initB=1
-neuron=relu
-gpu=0
-
-[fc2048bb]
-type=fc
-inputs=dropout1b,dropout1a
-outputs=2048
-initW=0.01,0.01
-initB=1
-neuron=relu
-gpu=1
-
-[dropout2a]
-type=dropout
-inputs=fc2048ba
-
-[dropout2b]
-type=dropout
-inputs=fc2048bb
-
-[fc1000]
-type=fc
-outputs=1000
-inputs=dropout2a,dropout2b
-initW=0.01,0.01
-gpu=0
-
-[probs]
-type=softmax
-inputs=fc1000
-
-[logprob]
-type=cost.logreg
-inputs=labels,probs
-gpu=0
+++ /dev/null
-[data]
-type=data
-dataIdx=0
-
-[labvec]
-type=data
-dataIdx=1
-
-[conv1]
-type=conv
-inputs=data
-channels=3
-filters=64
-padding=0
-stride=4
-filterSize=11
-initW=0.01
-sumWidth=4
-sharedBiases=1
-gpu=0,1,2,3
-
-[rnorm1]
-type=cmrnorm
-inputs=conv1
-channels=64
-size=5
-
-[pool1]
-type=pool
-pool=max
-inputs=rnorm1
-sizeX=3
-stride=2
-channels=64
-neuron=relu
-
-[conv2]
-type=conv
-inputs=pool1
-filters=192
-padding=2
-stride=1
-filterSize=5
-channels=64
-initW=0.01
-initB=1
-sumWidth=3
-sharedBiases=1
-neuron=relu
-
-[rnorm2]
-type=cmrnorm
-inputs=conv2
-channels=192
-size=5
-
-[pool2]
-type=pool
-pool=max
-inputs=rnorm2
-sizeX=3
-stride=2
-channels=192
-
-[conv3]
-type=conv
-inputs=pool2
-filters=384
-padding=1
-stride=1
-filterSize=3
-channels=192
-initW=0.03
-sumWidth=3
-sharedBiases=1
-neuron=relu
-
-[conv4]
-type=conv
-inputs=conv3
-filters=256
-padding=1
-stride=1
-filterSize=3
-channels=384
-neuron=relu
-initW=0.03
-initB=1
-sumWidth=3
-sharedBiases=1
-
-[conv5]
-type=conv
-inputs=conv4
-filters=256
-padding=1
-stride=1
-filterSize=3
-channels=256
-initW=0.03
-initB=1
-sumWidth=3
-
-[pool3]
-type=pool
-pool=max
-inputs=conv5
-sizeX=3
-stride=2
-channels=256
-neuron=relu
-
-[fc1024a]
-type=fc
-inputs=pool3
-outputs=1024
-initW=0.01
-initB=1
-neuron=relu
-gpu=0
-
-[fc1024b]
-type=fc
-inputs=pool3
-outputs=1024
-initW=0.01
-initB=1
-neuron=relu
-gpu=1
-
-[fc1024c]
-type=fc
-inputs=pool3
-outputs=1024
-initW=0.01
-initB=1
-neuron=relu
-gpu=2
-
-[fc1024d]
-type=fc
-inputs=pool3
-outputs=1024
-initW=0.01
-initB=1
-neuron=relu
-gpu=3
-
-[dropout1a]
-type=dropout2
-inputs=fc1024a
-
-[dropout1b]
-type=dropout2
-inputs=fc1024b
-
-[dropout1c]
-type=dropout2
-inputs=fc1024c
-
-[dropout1d]
-type=dropout2
-inputs=fc1024d
-
-# This is like a concatenation layer
-[pass1a]
-type=pass
-inputs=dropout1a,dropout1b,dropout1c,dropout1d
-gpu=0
-
-# This is like a concatenation layer
-[pass1b]
-type=pass
-inputs=dropout1a,dropout1b,dropout1c,dropout1d
-gpu=1
-
-# This is like a concatenation layer
-[pass1c]
-type=pass
-inputs=dropout1a,dropout1b,dropout1c,dropout1d
-gpu=2
-
-# This is like a concatenation layer
-[pass1d]
-type=pass
-inputs=dropout1a,dropout1b,dropout1c,dropout1d
-gpu=3
-
-
-[fc1024ba]
-type=fc
-inputs=pass1a
-outputs=1024
-initW=0.01
-initB=1
-neuron=relu
-
-[fc1024bb]
-type=fc
-inputs=pass1b
-outputs=1024
-initW=0.01
-initB=1
-neuron=relu
-
-[fc1024bc]
-type=fc
-inputs=pass1c
-outputs=1024
-initW=0.01
-initB=1
-neuron=relu
-
-[fc1024bd]
-type=fc
-inputs=pass1d
-outputs=1024
-initW=0.01
-initB=1
-neuron=relu
-
-[dropout2a]
-type=dropout2
-inputs=fc1024ba
-
-[dropout2b]
-type=dropout2
-inputs=fc1024bb
-
-[dropout2c]
-type=dropout2
-inputs=fc1024bc
-
-[dropout2d]
-type=dropout2
-inputs=fc1024bd
-
-[pass2a]
-inputs=dropout2a,dropout2b,dropout2c,dropout2d
-type=pass
-gpu=0
-
-[fc1000]
-type=fc
-outputs=1000
-inputs=pass2a
-initW=0.01
-
-[probs]
-type=softmax
-inputs=fc1000
-
-[logprob]
-type=cost.logreg
-inputs=labvec,probs
-gpu=0
-
+++ /dev/null
-[data]
-type=data
-dataIdx=0
-
-[labvec]
-type=data
-dataIdx=1
-
-[conv1]
-type=conv
-inputs=data
-channels=3
-filters=64
-padding=0
-stride=4
-filterSize=11
-initW=0.01
-sumWidth=4
-sharedBiases=1
-gpu=0,1,2,3
-
-[rnorm1]
-type=cmrnorm
-inputs=conv1
-channels=64
-size=5
-
-[pool1]
-type=pool
-pool=max
-inputs=rnorm1
-sizeX=3
-stride=2
-channels=64
-neuron=relu
-
-[conv2]
-type=conv
-inputs=pool1
-filters=192
-padding=2
-stride=1
-filterSize=5
-channels=64
-initW=0.01
-initB=1
-sumWidth=3
-sharedBiases=1
-neuron=relu
-
-[rnorm2]
-type=cmrnorm
-inputs=conv2
-channels=192
-size=5
-
-[pool2]
-type=pool
-pool=max
-inputs=rnorm2
-sizeX=3
-stride=2
-channels=192
-
-[conv3]
-type=conv
-inputs=pool2
-filters=384
-padding=1
-stride=1
-filterSize=3
-channels=192
-initW=0.03
-sumWidth=3
-sharedBiases=1
-neuron=relu
-
-[conv4]
-type=conv
-inputs=conv3
-filters=256
-padding=1
-stride=1
-filterSize=3
-channels=384
-neuron=relu
-initW=0.03
-initB=1
-sumWidth=3
-sharedBiases=1
-
-[conv5]
-type=conv
-inputs=conv4
-filters=256
-padding=1
-stride=1
-filterSize=3
-channels=256
-initW=0.03
-initB=1
-sumWidth=3
-
-[pool3]
-type=pool
-pool=max
-inputs=conv5
-sizeX=3
-stride=2
-channels=256
-neuron=relu
-
-[fc4096a]
-type=fc
-inputs=pool3
-outputs=4096
-initW=0.01
-initB=1
-neuron=relu
-
-[dropout1]
-type=dropout2
-inputs=fc4096a
-
-[fc4096b]
-type=fc
-inputs=dropout1
-outputs=4096
-initW=0.01
-initB=1
-neuron=relu
-
-[dropout2]
-type=dropout2
-inputs=fc4096b
-
-[fc1000]
-type=fc
-outputs=1000
-inputs=dropout2
-initW=0.01
-initB=-7
-
-[probs]
-type=softmax
-inputs=fc1000
-
-[logprob]
-type=cost.logreg
-inputs=labvec,probs
-gpu=0,1,2,3
-
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#################################################################################
-
-
-# This script makes batches suitable for training from raw ILSVRC 2012 tar files.
-
-import tarfile
-from StringIO import StringIO
-from random import shuffle
-import sys
-from time import time
-from pyext._MakeDataPyExt import resizeJPEG
-import itertools
-import os
-import cPickle
-import scipy.io
-import math
-import argparse as argp
-
-# Set this to True to crop images to square. In this case each image will be
-# resized such that its shortest edge is OUTPUT_IMAGE_SIZE pixels, and then the
-# center OUTPUT_IMAGE_SIZE x OUTPUT_IMAGE_SIZE patch will be extracted.
-#
-# Set this to False to preserve image borders. In this case each image will be
-# resized such that its shortest edge is OUTPUT_IMAGE_SIZE pixels. This was
-# demonstrated to be superior by Andrew Howard in his very nice paper:
-# http://arxiv.org/abs/1312.5402
-CROP_TO_SQUARE = True
-OUTPUT_IMAGE_SIZE = 256
-
-# Number of threads to use for JPEG decompression and image resizing.
-NUM_WORKER_THREADS = 8
-
-# Don't worry about these.
-OUTPUT_BATCH_SIZE = 3072
-OUTPUT_SUB_BATCH_SIZE = 1024
-
-def pickle(filename, data):
- with open(filename, "w") as fo:
- cPickle.dump(data, fo, protocol=cPickle.HIGHEST_PROTOCOL)
-
-def unpickle(filename):
- with open(filename) as fo:
- return cPickle.load(fo)
-
-def partition_list(l, partition_size):
- divup = lambda a,b: (a + b - 1) / b
- return [l[i*partition_size:(i+1)*partition_size] for i in xrange(divup(len(l),partition_size))]
-
-def open_tar(path, name):
- if not os.path.exists(path):
- print "ILSVRC 2012 %s not found at %s. Make sure to set ILSVRC_SRC_DIR correctly at the top of this file (%s)." % (name, path, sys.argv[0])
- sys.exit(1)
- return tarfile.open(path)
-
-def makedir(path):
- if not os.path.exists(path):
- os.makedirs(path)
-
-def parse_devkit_meta(ILSVRC_DEVKIT_TAR):
- tf = open_tar(ILSVRC_DEVKIT_TAR, 'devkit tar')
- fmeta = tf.extractfile(tf.getmember('ILSVRC2012_devkit_t12/data/meta.mat'))
- meta_mat = scipy.io.loadmat(StringIO(fmeta.read()))
- labels_dic = dict((m[0][1][0], m[0][0][0][0]-1) for m in meta_mat['synsets'] if m[0][0][0][0] >= 1 and m[0][0][0][0] <= 1000)
- label_names_dic = dict((m[0][1][0], m[0][2][0]) for m in meta_mat['synsets'] if m[0][0][0][0] >= 1 and m[0][0][0][0] <= 1000)
- label_names = [tup[1] for tup in sorted([(v,label_names_dic[k]) for k,v in labels_dic.items()], key=lambda x:x[0])]
-
- fval_ground_truth = tf.extractfile(tf.getmember('ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt'))
- validation_ground_truth = [[int(line.strip()) - 1] for line in fval_ground_truth.readlines()]
- tf.close()
- return labels_dic, label_names, validation_ground_truth
-
-def write_batches(target_dir, name, start_batch_num, labels, jpeg_files):
- jpeg_files = partition_list(jpeg_files, OUTPUT_BATCH_SIZE)
- labels = partition_list(labels, OUTPUT_BATCH_SIZE)
- makedir(target_dir)
- print "Writing %s batches..." % name
- for i,(labels_batch, jpeg_file_batch) in enumerate(zip(labels, jpeg_files)):
- t = time()
- jpeg_strings = list(itertools.chain.from_iterable(resizeJPEG([jpeg.read() for jpeg in jpeg_file_batch], OUTPUT_IMAGE_SIZE, NUM_WORKER_THREADS, CROP_TO_SQUARE)))
- batch_path = os.path.join(target_dir, 'data_batch_%d' % (start_batch_num + i))
- makedir(batch_path)
- for j in xrange(0, len(labels_batch), OUTPUT_SUB_BATCH_SIZE):
- pickle(os.path.join(batch_path, 'data_batch_%d.%d' % (start_batch_num + i, j/OUTPUT_SUB_BATCH_SIZE)),
- {'data': jpeg_strings[j:j+OUTPUT_SUB_BATCH_SIZE],
- 'labels': labels_batch[j:j+OUTPUT_SUB_BATCH_SIZE]})
- print "Wrote %s (%s batch %d of %d) (%.2f sec)" % (batch_path, name, i+1, len(jpeg_files), time() - t)
- return i + 1
-
-if __name__ == "__main__":
- parser = argp.ArgumentParser()
- parser.add_argument('--src-dir', help='Directory containing ILSVRC2012_img_train.tar, ILSVRC2012_img_val.tar, and ILSVRC2012_devkit_t12.tar.gz', required=True)
- parser.add_argument('--tgt-dir', help='Directory to output ILSVRC 2012 batches suitable for cuda-convnet to train on.', required=True)
- args = parser.parse_args()
-
- print "CROP_TO_SQUARE: %s" % CROP_TO_SQUARE
- print "OUTPUT_IMAGE_SIZE: %s" % OUTPUT_IMAGE_SIZE
- print "NUM_WORKER_THREADS: %s" % NUM_WORKER_THREADS
-
- ILSVRC_TRAIN_TAR = os.path.join(args.src_dir, 'ILSVRC2012_img_train.tar')
- ILSVRC_VALIDATION_TAR = os.path.join(args.src_dir, 'ILSVRC2012_img_val.tar')
- ILSVRC_DEVKIT_TAR = os.path.join(args.src_dir, 'ILSVRC2012_devkit_t12.tar.gz')
-
- assert OUTPUT_BATCH_SIZE % OUTPUT_SUB_BATCH_SIZE == 0
- labels_dic, label_names, validation_labels = parse_devkit_meta(ILSVRC_DEVKIT_TAR)
-
- with open_tar(ILSVRC_TRAIN_TAR, 'training tar') as tf:
- synsets = tf.getmembers()
- synset_tars = [tarfile.open(fileobj=tf.extractfile(s)) for s in synsets]
- print "Loaded synset tars."
- print "Building training set image list (this can take 10-20 minutes)..."
- sys.stdout.flush()
-
- train_jpeg_files = []
- for i,st in enumerate(synset_tars):
- if i % 100 == 0:
- print "%d%% ..." % int(round(100.0 * float(i) / len(synset_tars))),
- sys.stdout.flush()
- train_jpeg_files += [st.extractfile(m) for m in st.getmembers()]
- st.close()
-
- shuffle(train_jpeg_files)
- train_labels = [[labels_dic[jpeg.name[:9]]] for jpeg in train_jpeg_files]
- print "done"
-
- # Write training batches
- i = write_batches(args.tgt_dir, 'training', 0, train_labels, train_jpeg_files)
-
- # Write validation batches
- val_batch_start = int(math.ceil((i / 1000.0))) * 1000
- with open_tar(ILSVRC_VALIDATION_TAR, 'validation tar') as tf:
- validation_jpeg_files = sorted([tf.extractfile(m) for m in tf.getmembers()], key=lambda x:x.name)
- write_batches(args.tgt_dir, 'validation', val_batch_start, validation_labels, validation_jpeg_files)
-
- # Write meta file
- meta = unpickle('input_meta')
- meta_file = os.path.join(args.tgt_dir, 'batches.meta')
- meta.update({'batch_size': OUTPUT_BATCH_SIZE,
- 'num_vis': OUTPUT_IMAGE_SIZE**2 * 3,
- 'label_names': label_names})
- pickle(meta_file, meta)
- print "Wrote %s" % meta_file
- print "All done! ILSVRC 2012 batches are in %s" % args.tgt_dir
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDES := -I./include
-COMMONFLAGS :=
-CC_ARGS :=
-
-ifndef debug
- CC_ARGS += -O3
-endif
-CC=g++
-
-OUT_DIR=./bin/$(OUT_SUFFIX)
-
-PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2)
-LINK_LIBS := -L$(CUDA_INSTALL_PATH)/lib64 `pkg-config --libs python` `pkg-config --libs opencv` -lpthread
-
-INCLUDES += -I$(PYTHON_INCLUDE_PATH)
-OUT_FILE=_MakeDataPyExt.so
-
-all: dir classes $(OUT_FILE)
-
-dir:
- mkdir -p $(OUT_DIR)/src
-
-SOURCES = $(shell echo src/*.cpp)
-CLASSES = $(SOURCES:.cpp=.o)
-
-classes: $(CLASSES)
-
-%.o: %.cpp
- $(CC) $(CC_ARGS) -c -fPIC $(BUILD_ARGS) $(COMMONFLAGS) $(INCLUDES) $< -o $(OUT_DIR)/$*.o
-
-$(OUT_FILE): classes
- cd $(OUT_DIR) && $(CC) $(CC_ARGS) $(BUILD_ARGS) $(COMMONFLAGS) -shared -Wl,-no-undefined -o $(OUT_FILE) $(CLASSES) $(LINK_LIBS)
- ln -sf $(OUT_DIR)/$(OUT_FILE) .
-
-clean:
- rm -rf $(OUT_DIR)/*
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import, division, print_function, unicode_literals
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef INCLUDE_PYEXT_H_
-#define INCLUDE_PYEXT_H_
-
-#include <stdio.h>
-//#include <jpeglib.h>
-#include <opencv2/opencv.hpp>
-#include <Python.h>
-#include "../../../util/include/thread.h"
-
-#define JPEG_QUALITY 95
-
-#ifndef DIVUP
-#define DIVUP(a,b) (((a) + (b) - 1) / (b))
-#endif
-
-extern "C" {
- void init_MakeDataPyExt();
-}
-PyObject* resizeJPEG(PyObject *self, PyObject *args);
-
-class DecoderThread : public Thread {
- protected:
- PyObject* _py_list_src;
- PyObject* _py_list_tgt;
- int _start_img, _end_img;
- int _target_size;
- bool _crop_to_square;
-
- cv::Mat _resized_mat_buffer;
- std::vector<uchar> _output_jpeg_buffer;
- std::vector<int> _encode_params;
-
- void* run();
- void makeJPEG(int idx);
-
- public:
- DecoderThread(PyObject* py_list_src, int start_img, int end_img, int target_size, bool crop_to_square);
- virtual ~DecoderThread();
- PyObject* getTargetList();
-};
-
-
-#endif // INCLUDE_PYEXT_H_
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/pyext.h"
-
-using namespace std;
-
-static PyMethodDef _MakeDataPyExtMethods[] = {{ "resizeJPEG", resizeJPEG, METH_VARARGS },
- { NULL, NULL }
-};
-
-void init_MakeDataPyExt() {
- (void) Py_InitModule("_MakeDataPyExt", _MakeDataPyExtMethods);
-}
-
-PyObject* resizeJPEG(PyObject *self, PyObject *args) {
-
- PyListObject* pyListSrc;
- int tgtImgSize, numThreads;
- int cropToSquare;
-
- if (!PyArg_ParseTuple(args, "O!iii",
- &PyList_Type, &pyListSrc,
- &tgtImgSize,
- &numThreads,
- &cropToSquare)) {
- return NULL;
- }
-
- DecoderThread* threads[numThreads];
- int num_imgs = PyList_GET_SIZE(pyListSrc);
- int num_imgs_per_thread = DIVUP(num_imgs, numThreads);
- for (int t = 0; t < numThreads; ++t) {
- int start_img = t * num_imgs_per_thread;
- int end_img = min(num_imgs, (t+1) * num_imgs_per_thread);
-
- threads[t] = new DecoderThread((PyObject*)pyListSrc, start_img, end_img, tgtImgSize, cropToSquare);
- threads[t]->start();
- }
-
- PyObject* pyListTgt = PyList_New(0);
- for (int t = 0; t < numThreads; ++t) {
- threads[t]->join();
- PyList_Append(pyListTgt, threads[t]->getTargetList());
- delete threads[t]; // the thread's list too
- }
-
- return pyListTgt;
-}
-
-DecoderThread::DecoderThread(PyObject* py_list_src, int start_img, int end_img, int target_size, bool crop_to_square)
-: Thread(true), _py_list_src(py_list_src), _start_img(start_img), _end_img(end_img), _target_size(target_size), _crop_to_square(crop_to_square) {
-
- _encode_params.push_back(CV_IMWRITE_JPEG_QUALITY);
- _encode_params.push_back(JPEG_QUALITY);
- _py_list_tgt = PyList_New(0);
-}
-
-DecoderThread::~DecoderThread(){
- Py_DECREF(_py_list_tgt);
-}
-
-void* DecoderThread::run() {
- for (int i = _start_img; i < _end_img; ++i) {
- makeJPEG(i);
- }
- return NULL;
-}
-
-PyObject* DecoderThread::getTargetList() {
- return _py_list_tgt;
-}
-
-void DecoderThread::makeJPEG(int idx) {
- /*
- * Decompress JPEG
- */
- PyObject* pySrc = PyList_GET_ITEM(_py_list_src, idx);
- uchar* src = (unsigned char*)PyString_AsString(pySrc);
- size_t src_len = PyString_GET_SIZE(pySrc);
- vector<uchar> src_vec(src, src + src_len);
-
- cv::Mat decoded_mat = cv::imdecode(cv::Mat(src_vec), cv::IMREAD_COLOR);
- assert(decoded_mat.channels() == 3);
-
- /*
- * Resize
- */
- double min_dim = std::min(decoded_mat.size().height, decoded_mat.size().width);
- double scale_factor = _target_size / min_dim;
-
- int new_height = round(scale_factor * decoded_mat.size().height);
- int new_width = round(scale_factor * decoded_mat.size().width);
- assert((new_height == _target_size && new_width >= _target_size)
- || (new_width == _target_size && new_height >= _target_size));
- int interpolation = scale_factor == 1 ? cv::INTER_LINEAR
- : scale_factor > 1 ? cv::INTER_CUBIC : cv::INTER_AREA;
-
- cv::resize(decoded_mat, _resized_mat_buffer, cv::Size(new_width, new_height), 0, 0, interpolation);
-
- /*
- * Conditionally crop and compress JPEG
- */
- if (_crop_to_square) {
- int crop_start_x = (new_width - _target_size) / 2;
- int crop_start_y = (new_height - _target_size) / 2;
- cv::Rect cropRect(crop_start_x, crop_start_y, _target_size, _target_size);
- cv::Mat cropped_mat_buffer = _resized_mat_buffer(cropRect);
- cv::imencode(".jpg", cropped_mat_buffer, _output_jpeg_buffer, _encode_params);
- } else {
- cv::imencode(".jpg", _resized_mat_buffer, _output_jpeg_buffer, _encode_params);
- }
-
- char* output_jpeg_buffer_ptr = reinterpret_cast<char*>(&_output_jpeg_buffer[0]);
- PyObject* pyStr = PyString_FromStringAndSize(output_jpeg_buffer_ptr, _output_jpeg_buffer.size());
- PyList_Append(_py_list_tgt, pyStr);
- Py_DECREF(pyStr);
-}
+++ /dev/null
-################################################################################
-#
-# Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
-#
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users. This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
-#
-################################################################################
-
-# Location of the CUDA Toolkit binaries and libraries
-CUDA_INC_PATH = $(CUDA_INSTALL_PATH)/include
-CUDA_BIN_PATH = $(CUDA_INSTALL_PATH)/bin
-CUDA_LIB_PATH = $(CUDA_INSTALL_PATH)/lib64
-
-# Common binaries
-NVCC = $(CUDA_BIN_PATH)/nvcc
-GCC = g++
-AR = ar
-
-# CUDA code generation flags
-GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
-GENCODE_FLAGS := $(GENCODE_SM35)
-
-LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
-CCFLAGS := -m64
-NVCCFLAGS := -m64
-
-# Debug build flags
-ifeq ($(dbg),1)
- CCFLAGS += -g
- NVCCFLAGS += -g -G
- DBG := debug
-else
- DBG := release
- NVCCFLAGS += -O3
- CCFLAGS += -O3
-endif
-
-# Add profiler output
-ifeq ($(prof),1)
- NVCCFLAGS += --ptxas-options=-v
-endif
-
-TARGETDIR := ./bin/$(DBG)
-OBJDIR := ./obj/$(DBG)
-
-########## USER STUFF ###########
-LDFLAGS += -L../util -lutilpy -lcublas
-INCLUDES := -I$(CUDA_INC_PATH) -I $(CUDA_SDK_PATH)/common/inc -I./include
-
-CUFILES := $(shell find . -name "*.cu")
-CU_DEPS := $(shell find . -name "*.cuh")
-CCFILES := $(shell find . -name "*.cpp")
-C_DEPS := $(shell find . -name "*.h")
-
-NVCCFLAGS += --compiler-options '-fPIC'
-LDFLAGS += -shared
-CCFLAGS += -fPIC
-TARGET := $(TARGETDIR)/libnvmatrix.so
-
-################################################################################
-# Set up target and object files
-################################################################################
-OBJS += $(patsubst %.cpp,$(OBJDIR)/%.cpp.o,$(CCFILES))
-OBJS += $(patsubst %.c,$(OBJDIR)/%.c.o,$(CFILES))
-OBJS += $(patsubst %.cu,$(OBJDIR)/%.cu.o,$(CUFILES))
-
-# Target rules
-all: makedirs $(TARGET)
-
-$(OBJDIR)/%.cu.o : %.cu $(CU_DEPS)
- $(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(INCLUDES) -o $@ -c $<
-
-$(OBJDIR)/%.cpp.o : %.cpp $(C_DEPS)
- $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $<
-
-$(TARGET): $(OBJS)
- $(GCC) $(CCFLAGS) -o $@ $+ $(LDFLAGS)
- ln -sf $(TARGET) .
-
-makedirs:
- mkdir -p $(TARGETDIR)
- mkdir -p $(OBJDIR)/src
-
-clean:
- rm -rf ./obj
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MEMORY_CUH_H_
-#define MEMORY_CUH_H_
-#include <map>
-#include <cuda.h>
-#include <string.h>
-#include <vector>
-#include <assert.h>
-
-#include <helper_cuda.h>
-#include "../../util/include/sync.h"
-#include "nvmatrix_kernels.cuh"
-
-#define GPU_ALLOC_FRACTION 0.95 // Take 95% of available GPU memory
-#define HOST_ALLOC_CHUNK (1UL << 32)
-#define SYNC_ON_FREE true
-#define BUCKET_TYPE unsigned int
-
-// Allocte memory from up to this many buckets higher than desired without subdividing
-#define BUCKET_DIVISION_THRESHOLD 1
-#define NUM_BUCKETS static_cast<int>(sizeof(BUCKET_TYPE) * 8)
-#define CLZ(x) ((x) == 0 ? (NUM_BUCKETS) : __builtin_clz(x))
-#define CEIL_LOG2(x) (NUM_BUCKETS - CLZ(x)) // Ceiling of log base 2 of (x + 1)
-#define LOG_FIRST_BUCKET_SIZE 12
-#define FIRST_BUCKET_SIZE (1 << LOG_FIRST_BUCKET_SIZE) // First bucket is for 4K bytes
-#define GET_ALLOC_BUCKET(size) (CEIL_LOG2(((size) - 1) >> LOG_FIRST_BUCKET_SIZE))
-#define GET_DEALLOC_BUCKET(size) (CEIL_LOG2((size) >> (1 + LOG_FIRST_BUCKET_SIZE)))
-#define GET_BUCKET_SIZE(b) (1UL << (LOG_FIRST_BUCKET_SIZE + b))
-
-#define BUCKET_MASK(b) (1UL << (b))
-#define PREV_BUCKETS_MASK(b) (BUCKET_MASK(b) - 1)
-#define AVAILABLE_NEXT_MASK(b, buckets) ((buckets) & ~PREV_BUCKETS_MASK(b))
-
-/*
- * Returns the "best-matching" available bucket as defined by policy.
- * The two policies are:
- *
- * TAKE_FROM_BIGGEST = true: If a bucket in the range
- * b...{b + BUCKET_DIVISION_THRESHOLD} is available, return the smallest
- * available bucket in that range. Otherwise return the *biggest* available
- * bucket greater than or equal to b.
- *
- * TAKE_FROM_BIGGEST = false: Return the *smallest* available bucket greater
- * than or equal to b.
- *
- * Returns -1 when no satisfactory bucket is available.
- */
-#define TAKE_FROM_BIGGEST true
-#if TAKE_FROM_BIGGEST
-#define GET_AVAILABLE_BUCKET(b, buckets) \
- (-1 + (((AVAILABLE_NEXT_MASK(b, buckets)) \
- & (PREV_BUCKETS_MASK((b) + 1 + BUCKET_DIVISION_THRESHOLD))) \
- /* Smallest bucket >= b */ ? __builtin_ffs(AVAILABLE_NEXT_MASK(b, buckets)) \
- /* Biggest bucket >= b */ : CEIL_LOG2(AVAILABLE_NEXT_MASK(b, buckets))))
-#else
-#define GET_AVAILABLE_BUCKET(b, buckets) __builtin_ffs(AVAILABLE_NEXT_MASK(b, buckets))
-#endif
-
-/*
- * Bit get/set/clear.
- */
-#define GET_BIT(x, bit) ((x) & (1 << (bit)))
-#define SET_BIT(x, bit) ((x) |= (1 << (bit)))
-#define CLEAR_BIT(x, bit) ((x) &= ~(1 << (bit)))
-
-typedef struct __align__(512) {
- char data;
-} DataType;
-
-#define SIZE_ROUNDUP(size) (sizeof(DataType) * DIVUP((size), sizeof(DataType)))
-
-class MemorySegment {
- friend class FastMemoryManager;
-protected:
- DataType* _data;
- size_t _size;
- int _deviceID;
- // Resizes itself to _size - size and
- // returns pointer to new memory segment
- MemorySegment* subdivide(size_t size) {
- assert(size < _size);
-// assert(size % sizeof(DataType) == 0);
- _size -= size;
- return new MemorySegment(_data + _size / sizeof(DataType), size, _deviceID);
- }
-
- inline size_t getSize() const {
- return _size;
- }
-public:
- MemorySegment(DataType* data, size_t size, int deviceID) : _data(data), _size(size), _deviceID(deviceID) {
- assert(size % sizeof(DataType) == 0);
- }
- // In some cases size is irrelevant
- template<typename T> MemorySegment(T* data) : _data(reinterpret_cast<DataType*>(data)), _size(0), _deviceID(-1) {
-
- }
-
- template <class T /*= DataType*/>
- inline T* getData() const {
- return reinterpret_cast<T*>(_data);
- }
-
- template <class T /*= DataType*/>
- inline T** getDataPtr() {
- return reinterpret_cast<T**>(&_data);
- }
-
- inline int getDeviceID() const {
- return _deviceID;
- }
-};
-
-class MemoryManager {
-protected:
- static Lock _globalLock;
-public:
- virtual MemoryManager* init() = 0;
- virtual MemorySegment* malloc(size_t size) = 0;
- virtual void free(MemorySegment* mem) = 0;
- virtual ~MemoryManager() {
-
- }
-};
-
-class FastMemoryManager : public MemoryManager {
-protected:
- int _deviceID;
- Lock _lock;
- DataType* _data;
- size_t _size;
- BUCKET_TYPE _buckets; // Bucket availability bit vector
- std::vector<std::vector<MemorySegment*> > _freeSegments; // bucket idx -> vector of segments
-
- static std::map<int, MemoryManager*> _memoryManagers;
-
- virtual void allocateInitialSegment() {
- assert(_deviceID >= 0);
- assert(FIRST_BUCKET_SIZE % sizeof(DataType) == 0);
- checkCudaErrors(cudaSetDevice(_deviceID));
- size_t memFree, memTotal;
- checkCudaErrors(cudaMemGetInfo(&memFree, &memTotal));
- _size = sizeof(DataType) * (size_t(round(double(memFree) * GPU_ALLOC_FRACTION)) / sizeof(DataType));
- printf("FastMemoryManager[%d] allocating %lu-byte initial segment\n", _deviceID, _size);
- checkCudaErrors(cudaMalloc(&_data, _size));
- }
-
- virtual void freeInitialSegment() {
- checkCudaErrors(cudaFree(_data));
- }
-
-public:
- static MemoryManager& getInstance(int deviceID);
- static void destroyInstance(int deviceID);
-
- FastMemoryManager(int deviceID) : _deviceID(deviceID), _data(NULL), _size(0), _buckets(0) {
- }
-
- ~FastMemoryManager() {
- freeInitialSegment();
- for (int i = 0; i < _freeSegments.size(); ++i) {
- for (int j = 0; j < _freeSegments[i].size(); ++j) {
- delete _freeSegments[i][j];
- }
- }
- }
-
- virtual MemoryManager* init() {
- allocateInitialSegment();
-
- for (int i = 0; i < NUM_BUCKETS; ++i) {
- _freeSegments.push_back(std::vector<MemorySegment*>());
- }
- int bucket = GET_DEALLOC_BUCKET(_size);
- SET_BIT(_buckets, bucket);
- _freeSegments[bucket].push_back(new MemorySegment(_data, _size, _deviceID));
- return this;
- }
-
- MemorySegment* malloc(size_t size) {
- assert(size > 0);
- int requestedBucket = GET_ALLOC_BUCKET(size);
- _lock.acquire();
-
- int bucket = GET_AVAILABLE_BUCKET(requestedBucket, _buckets);
-// if (bucket - requestedBucket > BUCKET_DIVISION_THRESHOLD) {
-// printf("MemoryManager[%d] requested size: %lu, requested bucket: %d, available bucket: %d\n", _deviceID, size, requestedBucket, bucket);
-// }
-
- assert(bucket >= requestedBucket); // Out of memory
-
- MemorySegment* sourceSegment = _freeSegments[bucket].back();
- MemorySegment* ret = sourceSegment;
- if (bucket - requestedBucket > BUCKET_DIVISION_THRESHOLD) { // We got a much bigger chunk than we wanted
- ret = sourceSegment->subdivide(GET_BUCKET_SIZE(requestedBucket));
- int newSrcBucket = GET_DEALLOC_BUCKET(sourceSegment->getSize());
- if (newSrcBucket != bucket) {
- _freeSegments[bucket].pop_back();
- _freeSegments[newSrcBucket].push_back(sourceSegment);
- SET_BIT(_buckets, newSrcBucket);
- }
- } else {
- _freeSegments[bucket].pop_back();
- }
- if (_freeSegments[bucket].size() == 0) {
- CLEAR_BIT(_buckets, bucket);
- }
- _lock.release();
- return ret;
- }
-
- void free(MemorySegment* mem) {
- assert(mem != NULL);
- assert(mem->getSize() >= FIRST_BUCKET_SIZE);
- int bucket = GET_DEALLOC_BUCKET(mem->getSize());
- // Synchronize for safety, so that we don't free memory that's being used. Not synchronizing
- // could potentially cause a problem if we re-allocate the just-freed chunk and attempt to
- // use it in a different stream.
- if (SYNC_ON_FREE) {
- int d;
- checkCudaErrors(cudaGetDevice(&d));
- checkCudaErrors(cudaSetDevice(mem->getDeviceID()));
- checkCudaErrors(cudaDeviceSynchronize());
- checkCudaErrors(cudaSetDevice(d));
- }
- _lock.acquire();
- _freeSegments[bucket].push_back(mem);
- SET_BIT(_buckets, bucket);
-// printf("MemoryManager[%d] Freed segment of size %lu into bucket %lu\n", _deviceID, mem->getSize(), bucket);
- _lock.release();
- }
-};
-
-class FastHostMemoryManager : public FastMemoryManager {
-protected:
- static MemoryManager* _memoryManager;
- void allocateInitialSegment() {
- _size = HOST_ALLOC_CHUNK;
- checkCudaErrors(cudaHostAlloc(&_data, _size, cudaHostAllocPortable));
- }
- void freeInitialSegment () {
- checkCudaErrors(cudaFreeHost(_data));
- }
-public:
- FastHostMemoryManager() : FastMemoryManager(DEVICE_HOST) {
- }
-
- static MemoryManager& getInstance();
- static void destroyInstance();
-};
-
-class CUDAMemoryManager : public MemoryManager {
-protected:
- static MemoryManager* _memoryManager;
-
- virtual void _malloc(DataType** data, size_t size) {
- checkCudaErrors(cudaMalloc(data, size));
- }
- virtual void _free(MemorySegment* mem) {
- checkCudaErrors(cudaFree(mem->getData<DataType>()));
- }
-public:
- static MemoryManager& getInstance(int deviceID);
- static void destroyInstance(int deviceID);
- CUDAMemoryManager() {
- }
-
- MemoryManager* init() {
- return this;
- }
-
- MemorySegment* malloc(size_t size) {
- MemorySegment* seg = new MemorySegment(reinterpret_cast<DataType*>(NULL));
- DataType** data = seg->getDataPtr<DataType>();
- _malloc(data, size);
- return seg;
- }
-
- void free(MemorySegment* mem) {
- assert(mem != NULL);
- _free(mem);
- delete mem;
- }
-};
-
-class CUDAHostMemoryManager : public CUDAMemoryManager {
-protected:
- static MemoryManager* _memoryManager;
- void _free(MemorySegment* mem) {
- checkCudaErrors(cudaFreeHost(mem->getData<DataType>()));
- }
- void _malloc(DataType** data, size_t size) {
- checkCudaErrors(cudaHostAlloc(data, size, cudaHostAllocPortable));
- }
-public:
- static MemoryManager& getInstance();
- static void destroyInstance();
- CUDAHostMemoryManager() : CUDAMemoryManager() {
-
- }
-};
-#endif /* MEMORY_CUH_H_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef NVMATRIX_H_
-#define NVMATRIX_H_
-
-#include <map>
-#include <vector>
-#include <cublas_v2.h>
-#include <cuda.h>
-#include <curand.h>
-#include <time.h>
-#include <curand_kernel.h>
-
-#include <helper_cuda.h>
-#include "../../util/include/matrix.h"
-#include "nvmatrix_kernels.cuh"
-#include "nvmatrix_operators.cuh"
-#include "memory.cuh"
-
-#ifdef WARNINGS
-#define WARN(msg) printf("WARN: File %s, line %d: %s\n", __FILE__, __LINE__, msg);
-#else
-#define WARN(msg) ;
-#endif
-
-#define CURAND_CALL(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
- printf("CURAND Error at %s:%d\n",__FILE__,__LINE__);\
- exit(EXIT_FAILURE);}} while(0)
-
-#define CUBLAS_CALL(x) do { if((x) != CUBLAS_STATUS_SUCCESS) { \
- printf("CUBLAS Error at %s:%d\n",__FILE__,__LINE__);\
- exit(EXIT_FAILURE);}} while(0)
-
-/*
- * Memory manager to use for GPU memory allocations.
- *
- * CUDAMemoryManager: Default Nvidia memory manager; just calls cudaMalloc / cudaFree.
- * Allocating and freeing memory is slow.
- * FastMemoryManager: A GPU memory manager with very fast (constant time)
- * alloc / free, but possibly more wasteful of memory.
- */
-#define DEVICE_MEMORY_MANAGER CUDAMemoryManager
-
-/*
- * Memory manager to use for host memory allocations.
- *
- * CUDAHostMemoryManager: Default Nvidia memory manager; just calls cudaHostAlloc / cudaFreeHost.
- * Allocating and freeing memory is slow.
- * FastHostMemoryManager: A host memory manager with very fast (constant time)
- * alloc / free, but possibly more wasteful of memory.
- */
-#define HOST_MEMORY_MANAGER CUDAHostMemoryManager
-
-class NVMatrix;
-typedef std::vector<NVMatrix*> NVMatrixV;
-
-class NVMatrix {
-protected:
- int _numCols, _numRows;
- int _numElements;
- int _stride;
-// float* getDevData();
- MemorySegment* _memSegment;
- bool _isTrans;
- bool _ownsData;
- // This flag makes sure that the NVMatrix destructor does nothing
- // when called on HostNVMatrix instance.
- bool _deleted;
- cudaTextureObject_t _texObj;
-
-// static std::map<int,curandGenerator_t> rndGen;
- static std::map<int,MemorySegment*> _rndDevStates;
- static std::map<int,cublasHandle_t> _cublasHandles;
- // Map from device id --> # of random streams initialized on that device
- static std::map<int,int> _rndDevThreads;
- static pthread_mutex_t *_rndMutex, *_cublasMutex, *_streamMutex;
- // Map from device id --> default stream
- static std::map<int,cudaStream_t> _defaultStreams;
-
- cublasOperation_t getTransChar() const {
- /*
- * not a typo! return opposite character because a
- * non-transposed nvmatrix is in row-major order while a non-transposed
- * cublas matrix is in column-major order.
- */
- return _isTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
- }
-
- void _init(bool isTrans);
- void _sum_setParams(int n, dim3* blocks, dim3* threads);
- template<class Agg> float cpuAgg(Agg agg, cudaStream_t stream);
- template<class Agg> float _totalAgg(Agg agg);
- template<class Agg> float _totalAgg(Agg agg, cudaStream_t stream);
- template<class Agg> float _totalAgg(Agg agg, NVMatrix& tmpbuf, cudaStream_t stream);
- template<class Agg, class UnaryOp, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix* tmp);
- template<class Agg, class UnaryOp, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream);
- template<class Agg, class UnaryOp, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop);
- template<class Agg, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream);
- template<class Agg, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop);
- template<class Agg, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream);
- template<class Agg, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop);
- template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, cudaStream_t stream);
- template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop);
-
- template<class Agg, class UnaryOp, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp);
- template<class Agg, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp);
- template<class Agg, class BinaryOp> void _aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, NVMatrix& tmp);
- template<class Agg, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp);
- template<class Agg, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, BinaryOp bop, NVMatrix& tmp);
- template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp);
- template<class Agg, class UnaryOp, class BinaryOp> NVMatrix& _aggregate(int axis, Agg agg, UnaryOp, BinaryOp bop, NVMatrix& tmp);
-
- template <class Randomizer> void _unaryRandomize(NVMatrix& target, Randomizer rnd, cudaStream_t stream);
- template <class Randomizer> void _unaryRandomize(NVMatrix& target, Randomizer rnd);
- template <class Randomizer> void _binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd);
- template <class Randomizer> void _binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd, cudaStream_t stream);
-
- virtual void alloc(int numElements);
- virtual void dealloc();
- void deallocTexture();
- virtual NVMatrix& construct() const;
- virtual NVMatrix& construct(bool isTrans) const;
- virtual NVMatrix& construct(int numRows, int numCols, bool isTrans=false) const;
- virtual NVMatrix& construct(const Matrix& like, bool copy) const;
- virtual NVMatrix& construct(const NVMatrix& like, bool copy) const;
- virtual NVMatrix& construct(const NVMatrix& like) const;
- virtual NVMatrix& construct(const Matrix& like) const;
- virtual NVMatrix& construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const;
- static cublasHandle_t getCublasHandle();
- static cublasHandle_t getCublasHandle(int deviceID);
-public:
- NVMatrix();
- NVMatrix(bool isTrans);
- NVMatrix(int numRows, int numCols, bool isTrans=false);
- NVMatrix(const Matrix& like, bool copy);
- NVMatrix(const NVMatrix& like, bool copy);
- NVMatrix(const NVMatrix& like);
- NVMatrix(const Matrix& like);
- NVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans);
- virtual ~NVMatrix();
-
- // Returns the device ID on which the data pointer is allocated
- int getDataDeviceID() const;
- static void initRandom(unsigned long long seed, int numStreams, cudaStream_t stream);
- static void initRandom(unsigned long long seed, int numStreams);
- static void initRandom(unsigned long long seed);
- static void initRandom();
- static void initCublas();
- static void destroyCublas();
- static std::pair<size_t, size_t> getCudaMemorySize();
-
- // Returns the currently-active device ID for calling thread
- static int getDeviceID();
- static void setDeviceID(int d);
- static bool canAccessPeer(int srcDevice, int tgtDevice);
- static bool isRndInitialized();
- static bool isRndInitialized(bool haveLock);
- static curandState* getCurandState();
- static curandState* getCurandState(int numStreams);
- static void destroyRandom();
- static pthread_mutex_t* makeMutex();
- static cudaStream_t getDefaultStream(int deviceID);
- static cudaStream_t getDefaultStream();
- static void syncDevice();
- static void syncStream();
- static void syncStream(cudaStream_t stream);
-
- /*
- * DO NOT DEREFERENCE IN HOST CODE! This is a device memory pointer.
- */
- float* getCellPtr(int i, int j) const {
- if (_isTrans) {
- return &getDevData()[j * _numRows + i];
- }
- return &getDevData()[i * _numCols + j];
- }
-
- bool isSameDims(const Matrix& m) const {
- return m.getNumRows() == _numRows && m.getNumCols() == _numCols;
- }
-
- bool isSameDims(const NVMatrix& m) const {
- return m.getNumRows() == _numRows && m.getNumCols() == _numCols;
- }
-
- int getNumRows() const {
- return _numRows;
- }
-
- int getNumCols() const {
- return _numCols;
- }
-
- int getStride() const {
- return _stride;
- }
-
- int getLeadingDim() const {
- return _isTrans ? _numRows : _numCols;
- }
-
- int getFollowingDim() const {
- return !_isTrans ? _numRows : _numCols;
- }
-
- /*
- * FALSE: Row-major order.
- * TRUE: Column-major order.
- */
- bool isTrans() const {
- return _isTrans;
- }
-
- bool isView() const {
- return !_ownsData;
- }
-
- float* getDevData() const {
- return _memSegment == NULL ? NULL : _memSegment->getData<float>();
- }
-
- MemorySegment& getMemorySegment() const {
- return *_memSegment;
- }
-
- int getNumElements() const {
- return _numElements;
- }
-
- size_t getNumDataBytes() const {
- return size_t(_numElements) * 4;
- }
-
- /*
- * Only use if you know what you're doing!
- * Does not actually transpose matrix.
- */
- void setTrans(bool trans) {
- if (trans != _isTrans) {
- assert(isContiguous());
- _isTrans = trans;
- _stride = getLeadingDim();
- }
- }
-
- /*
- * Only use if you know what you're doing!
- * This toggles whether this object will free its GPU memory when it's destroyed.
- */
- void setIsView(bool isView) {
- _ownsData = !isView;
- }
-
- bool isContiguous() const {
- return _stride == getLeadingDim() || getFollowingDim() == 1;
- }
-
- void truncate() {
- resize(0,0);
- }
-
- virtual cudaTextureObject_t getTextureObject();
-
- virtual void copyFromHost(const Matrix& hostMatrix);
- virtual void copyFromHost(const Matrix& hostMatrix, bool resizeTarget);
- virtual void copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream);
- virtual void copyToHost(Matrix& hostMatrix) const;
- virtual void copyToHost(Matrix& hostMatrix, bool resizeTarget) const;
- virtual void copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const;
- void copy(NVMatrix& dest) const;
- void copy(NVMatrix& dest, cudaStream_t stream) const;
- NVMatrix& copy() const;
- void addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB, cudaStream_t stream);
- void addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB);
- void addProduct(NVMatrix& a, NVMatrix &b);
- void rightMult(NVMatrix &b, float scaleAB, NVMatrix &target, cudaStream_t stream);
- void rightMult(NVMatrix &b, float scaleAB, NVMatrix &target);
- void rightMult(NVMatrix &b, NVMatrix &target);
- void rightMult(NVMatrix &b, float scaleAB);
- void randomizeUniform();
- void addGaussianNoise(NVMatrix& stdevs, bool var, NVMatrix& target);
- void addGaussianNoise(float stdev, NVMatrix& target);
- void addGaussianNoise(NVMatrix& stdevs, bool var);
- void addGaussianNoise(NVMatrix& stdevs);
- void addGaussianNoise(float stdev);
- void addGaussianNoise();
- void randomizeGaussian();
- void randomizeGaussian(float stdev);
- void randomizeGaussian(float mean, float stdev);
- void randomizeGaussian(float mean, NVMatrix& stdevs);
- void randomizeGaussian(float mean, float stdevMult, NVMatrix& stdevs);
- void randomizeGaussian(NVMatrix& stdevs);
- void randomizeGaussian(NVMatrix& stdevs, NVMatrix& target);
- void binarizeProbs();
- void binarizeProbs(NVMatrix& target);
-
- void biggerThan(NVMatrix& m, NVMatrix& target);
- void biggerThan(NVMatrix& m);
- void biggerThanVector(NVMatrix& vec, NVMatrix& target);
- void biggerThanVector(NVMatrix& vec);
- void equals(NVMatrix& m, NVMatrix& target);
- void equals(NVMatrix& m);
-
- void _checkBounds(int startRow, int endRow, int startCol, int endCol) const;
- NVMatrix& slice(int startRow, int endRow, int startCol, int endCol) const;
- void slice(int startRow, int endRow, int startCol, int endCol, NVMatrix& target) const;
- NVMatrix& sliceRows(int startRow, int endRow) const;
- void sliceRows(int startRow, int endRow, NVMatrix& target) const;
- NVMatrix& sliceCols(int startCol, int endCol) const;
- void sliceCols(int startCol, int endCol, NVMatrix& target) const;
-
- NVMatrixV& splitRows(int numParts);
- NVMatrixV& splitCols(int numParts);
-
- template <class Op> void apply(Op op, NVMatrix& target, cudaStream_t stream) {
- if (!target.isSameDims(*this)) {
- target.resize(*this);
- }
- if (getNumElements() > 0) {
- int height = target.getFollowingDim(), width = target.getLeadingDim();
-
- if (target.isTrans() == isTrans()) {
- if (!isContiguous() || !target.isContiguous()) {
- dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ELTWISE_THREADS_X)),
- std::min(NUM_BLOCKS_MAX, DIVUP(height, ELTWISE_THREADS_Y)));
- dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y);
- kEltwiseUnaryOp<Op><<<blocks, threads, 0, stream>>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op);
- getLastCudaError("kEltwiseUnaryOp: Kernel execution failed");
- } else {
- dim3 threads = dim3(ELTWISE_FLAT_THREADS_X);
- dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X)));
- kEltwiseUnaryOpFlat<Op><<<blocks, threads, 0, stream>>>(getDevData(), target.getDevData(), _numElements, op);
- getLastCudaError("kEltwiseUnaryOpFlat: Kernel execution failed");
- }
- } else {
- dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ELTWISE_THREADS_X)),
- std::min(NUM_BLOCKS_MAX, DIVUP(height, ELTWISE_THREADS_Y)));
- dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y);
- bool checkBounds = !(width % ELTWISE_THREADS_X == 0 && height % ELTWISE_THREADS_X == 0);
- // printf("height: %d, width: %d, stride: %d, target stride: %d, check bounds: %d, threads.x: %d, threads.y: %d, blocks.x: %d, blocks.y: %d\n",
- // height, width, getStride(), target.getStride(), checkBounds, threads.x, threads.y, blocks.x, blocks.y);
- if (checkBounds) {
- kEltwiseUnaryOpTrans<Op, true><<<blocks, threads, 0, stream>>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op);
- } else {
- kEltwiseUnaryOpTrans<Op, false><<<blocks, threads, 0, stream>>>(getDevData(), target.getDevData(), height, width, getStride(), target.getStride(), op);
- }
- getLastCudaError("kEltwiseUnaryOpTrans: Kernel execution failed");
- }
- }
- }
-
- template <class Op> void apply(Op op, cudaStream_t stream) {
- apply(op, *this, stream);
- }
-
- template <class Op> void apply(Op op, NVMatrix& target) {
- apply(op, target, getDefaultStream());
- }
-
- template <class Op> void apply(Op op) {
- apply(op, *this);
- }
-
- template <class Op> void applyBinary(Op op, NVMatrix& b) {
- applyBinary(op, b, *this);
- }
-
- template <class Op> void applyBinary(Op op, NVMatrix& b, NVMatrix& target) {
- applyBinary(op, b, target, getDefaultStream());
- }
-
- template <class Op> void applyBinary(Op op, NVMatrix& b, NVMatrix& target, cudaStream_t stream) {
- assert(this->isSameDims(b));
-
- if (!target.isSameDims(*this)) {
- target.resize(*this);
- }
-
- if (getNumElements() > 0) {
- int height = target.getFollowingDim(), width = target.getLeadingDim();
- if (target.isTrans() == isTrans() && target.isTrans() == b.isTrans()) {
- if (!isContiguous() || !b.isContiguous() || !target.isContiguous()) {
- dim3 blocks(std::min(128, DIVUP(width, ELTWISE_THREADS_X)),
- std::min(128, DIVUP(height, ELTWISE_THREADS_Y)));
- dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y);
- kEltwiseBinaryOp<Op><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), height, width, getStride(),
- b.getStride(), target.getStride(), op);
- } else {
- dim3 threads = dim3(ELTWISE_FLAT_THREADS_X);
- dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X)));
- kEltwiseBinaryOpFlat<Op><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), _numElements, op);
- }
- getLastCudaError("kEltwiseBinaryOp: Kernel execution failed");
- } else {
-
- dim3 blocks(std::min(128, DIVUP(width, ELTWISE_THREADS_X)),
- std::min(128, DIVUP(height, ELTWISE_THREADS_Y)));
- dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y);
- // both x here since y divides x
- bool checkBounds = !(width % ELTWISE_THREADS_X == 0 && height % ELTWISE_THREADS_X == 0);
- if (target.isTrans() == isTrans() && target.isTrans() != b.isTrans()) {
- if (checkBounds) {
- kEltwiseBinaryOpTrans<Op,true,false,false><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), height, width,getStride(),
- b.getStride(), target.getStride(), op);
- } else {
- kEltwiseBinaryOpTrans<Op,false,false,false><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), height, width,getStride(),
- b.getStride(), target.getStride(), op);
- }
- } else if (target.isTrans() != isTrans() && target.isTrans() != b.isTrans()) {
- if (checkBounds) {
- kEltwiseBinaryOpTrans<Op,true,true,false><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), height, width,getStride(),
- b.getStride(), target.getStride(), op);
- } else {
- kEltwiseBinaryOpTrans<Op,false,true,false><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), target.getDevData(), height, width,getStride(),
- b.getStride(), target.getStride(), op);
- }
- } else if (target.isTrans() != isTrans() && target.isTrans() == b.isTrans()) {
- if (checkBounds) {
- kEltwiseBinaryOpTrans<Op,true,false,true><<<blocks, threads, 0, stream>>>(b.getDevData(), getDevData(), target.getDevData(), height, width,b.getStride(),
- getStride(), target.getStride(), op);
- } else {
- kEltwiseBinaryOpTrans<Op,false,false,true><<<blocks, threads, 0, stream>>>(b.getDevData(), getDevData(), target.getDevData(), height, width, b.getStride(),
- getStride(), target.getStride(), op);
- }
- }
- getLastCudaError("kEltwiseBinaryOpTrans: Kernel execution failed");
- }
- }
- }
-
- template <class Op> void applyTernary(Op op, NVMatrix& b, NVMatrix& c, NVMatrix& target) {
- applyTernary(op, b, c, target, getDefaultStream());
- }
-
- template <class Op> void applyTernary(Op op, NVMatrix& b, NVMatrix& c, NVMatrix& target, cudaStream_t stream) {
- assert(isSameDims(b));
- assert(isSameDims(c));
- // For now ternary ops are only supported for matrices of same transposedness
- assert(isTrans() == b.isTrans());
- assert(isTrans() == c.isTrans());
- if (!target.isSameDims(*this) || target.isTrans() != isTrans()) {
- target.resize(*this);
- }
- if (getNumElements() > 0) {
- int height = target.getFollowingDim(), width = target.getLeadingDim();
- if (!isContiguous() || !b.isContiguous() || !c.isContiguous() || !target.isContiguous()) {
- dim3 blocks(std::min(512, DIVUP(width, ELTWISE_THREADS_X)),
- std::min(512, DIVUP(height, ELTWISE_THREADS_Y)));
- dim3 threads(ELTWISE_THREADS_X, ELTWISE_THREADS_Y);
- kEltwiseTernaryOp<Op><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), c.getDevData(), target.getDevData(), height, width,
- getStride(), b.getStride(), c.getStride(), target.getStride(), op);
- getLastCudaError("kEltwiseTernaryOp: Kernel execution failed");
- } else {
- dim3 threads = dim3(ELTWISE_FLAT_THREADS_X);
- dim3 blocks = dim3(std::min(128, DIVUP(_numElements, ELTWISE_FLAT_THREADS_X)));
- kEltwiseTernaryOpFlat<Op><<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), c.getDevData(), target.getDevData(), _numElements, op);
- getLastCudaError("kEltwiseTernaryOpFlat: Kernel execution failed");
- }
- }
- }
-
- bool resize(int numRows, int numCols, bool trans);
- bool resize(int numRows, int numCols);
- bool resize(const NVMatrix &like);
- bool resize(const Matrix &like);
- void reshape(int numRows, int numCols);
- NVMatrix& reshaped(int numRows, int numCols) const;
- void copy(NVMatrix &dest, int srcStartRow, int srcEndRow, int srcStartCol, int srcEndCol, int destStartRow, int destStartCol) const;
- void copy(NVMatrix &dest, int srcStartRow, int srcEndRow, int srcStartCol, int srcEndCol, int destStartRow, int destStartCol, cudaStream_t stream) const;
- void add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target, cudaStream_t stream);
- void add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target);
- void add(NVMatrix& b, float scaleB, NVMatrix& target);
- void add(NVMatrix& b, NVMatrix& target);
- void add(NVMatrix& b, float scaleB);
- void add(NVMatrix& b, float scaleA, float scaleB);
- void add(NVMatrix& b);
- void eltwiseMult(NVMatrix& b);
- void eltwiseMult(NVMatrix& b, NVMatrix& target);
- void eltwiseDivide(NVMatrix& b);
- void eltwiseDivide(NVMatrix& b, NVMatrix& target);
- void squaredDiff(NVMatrix& b);
- void squaredDiff(NVMatrix& b, NVMatrix& target);
- void subtract(NVMatrix& b, NVMatrix& target);
- void subtract(NVMatrix& b);
- void addVector(NVMatrix& vec, float scaleVec, NVMatrix& target, cudaStream_t stream);
- void addVector(NVMatrix& vec, float scaleVec, NVMatrix& target);
- void addVector(NVMatrix& vec);
- void addVector(NVMatrix& vec, float scaleVec);
- void addVector(NVMatrix& vec, NVMatrix& target);
- void equalsVector(NVMatrix& vec, NVMatrix& target);
- void equalsVector(NVMatrix& vec);
- void eltwiseMultByVector(NVMatrix& vec, NVMatrix& target, cudaStream_t stream);
- void eltwiseMultByVector(NVMatrix& vec, NVMatrix& target);
- void eltwiseMultByVector(NVMatrix& vec);
- void eltwiseMultByVector(NVMatrix& vec, cudaStream_t stream);
- void eltwiseDivideByVector(NVMatrix& vec, NVMatrix& target);
- void eltwiseDivideByVector(NVMatrix& vec);
- void tile(int timesY, int timesX, NVMatrix& target);
- void tile(int timesY, int timesX, NVMatrix& target, cudaStream_t stream);
-
- void addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum);
- void addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum, cudaStream_t stream);
- void addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax);
- void addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax, cudaStream_t stream);
- void sum(int axis, NVMatrix& target, cudaStream_t stream);
- void sum(int axis, NVMatrix& target);
- void sum(int axis, NVMatrix& target, cudaStream_t stream, NVMatrix& tmp);
- void sum(int axis, NVMatrix& target, NVMatrix& tmp);
- NVMatrix& sum(int axis);
- void max(int axis, NVMatrix& target);
- void max(int axis, NVMatrix& target, NVMatrix& tmp);
- NVMatrix& max(int axis);
- void min(int axis, NVMatrix& target);
- NVMatrix& min(int axis);
- void sumOfSquares(int axis, NVMatrix& target, cudaStream_t stream);
- void sumOfSquares(int axis, NVMatrix& target);
- NVMatrix& sumOfSquares(int axis);
- float mean();
- float sum();
- float sum(NVMatrix& tmpbuf);
- float max();
- float min();
- float countInf();
- float countNan();
- float norm2();
- float norm();
-
- void inRangeInc(float lower, float upper);
- void inRangeInc(float lower, float upper, NVMatrix& target);
- void inRangeExc(float lower, float upper);
- void inRangeExc(float lower, float upper, NVMatrix& target);
- void biggerThanScalar(float scalar);
- void biggerThanScalar(float scalar, NVMatrix& target);
- void smallerThanScalar(float scalar);
- void smallerThanScalar(float scalar, NVMatrix& target);
- void addScalar(float scaleThis, float scalar, NVMatrix& target);
- void addScalar(float scalar, NVMatrix& target);
- void addScalar(float scalar);
- void minWithScalar(float scalar, NVMatrix& target);
- void minWithScalar(float scalar);
- void maxWithScalar(float scalar, NVMatrix& target);
- void maxWithScalar(float scalar);
- void pow(float p, NVMatrix& target);
- void pow(float p);
- void scale(float _scale);
- void scale(float _scale, NVMatrix& target);
- void scale(float _scale, NVMatrix& target, cudaStream_t stream);
- void scale(float _scale, cudaStream_t stream);
- void zero();
- void zero(NVMatrix& like);
-
- float dotProduct(NVMatrix& b, NVMatrix& tmp, cudaStream_t stream);
- float dotProduct(NVMatrix& b, cudaStream_t stream);
- float dotProduct(NVMatrix& b);
-
- /*
- * Does SOFT transpose and returns result, leaving this matrix unchanged
- */
- NVMatrix& getTranspose();
- NVMatrix& getClone();
-
- /*
- * Does HARD transpose and puts result in target
- */
- void transpose(NVMatrix& target);
-
- /*
- * Does SOFT transpose
- */
- void transpose();
- bool transpose(bool trans);
-
- void flipTrans(NVMatrix& target, cudaStream_t stream);
- void flipTrans(NVMatrix& target);
- NVMatrix& flipTrans();
-
- void print(int startRow, int rows, int startCol, int cols) const;
- void print(int rows, int cols) const;
- void printShape(const char* name) const;
-
- template <class Op> void applyBinaryV(Op op, NVMatrix& vec, NVMatrix& target) {
- applyBinaryV(op, vec, target, getDefaultStream());
- }
-
- template <class Op> void applyBinaryV(Op op, NVMatrix& vec, NVMatrix& target, cudaStream_t stream) {
- assert(&target != &vec); // for now
- if (isSameDims(vec)) {
- applyBinary(op, vec, target, stream);
- return;
- }
- assert(vec.getNumRows() == 1 || vec.getNumCols() == 1);
- assert(vec.getNumRows() == _numRows || vec.getNumCols() == _numCols);
- assert(vec.isContiguous());
-
- target.resize(*this); // target must be same orientation as me for now
- int width = getLeadingDim(); //_isTrans ? _numRows : _numCols;
- int height = getFollowingDim(); //_isTrans ? _numCols : _numRows;
- dim3 threads(ADD_VEC_THREADS_X, ADD_VEC_THREADS_Y);
-
- if ((vec.getNumRows() == _numRows && !isTrans()) || (vec.getNumCols() == _numCols && isTrans())) {
- dim3 blocks(std::min(512, DIVUP(width, ADD_VEC_THREADS_X)), std::min(NUM_BLOCKS_MAX, DIVUP(height, ADD_VEC_THREADS_Y)));
- kColVectorOp<Op><<<blocks, threads, 0, stream>>>(getDevData(), vec.getDevData(), target.getDevData(), width, height, getStride(), target.getStride(), op);
- } else {
- dim3 blocks(std::min(NUM_BLOCKS_MAX, DIVUP(width, ADD_VEC_THREADS_X)), std::min(NUM_BLOCKS_MAX, DIVUP(height, ADD_VEC_THREADS_Y)));
- kRowVectorOp<Op><<<blocks, threads, 0, stream>>>(getDevData(), vec.getDevData(), target.getDevData(), width, height, getStride(), target.getStride(), op);
- }
- getLastCudaError("Kernel execution failed");
- // cudaThreadSynchronize();
- }
-
- template<class UnaryOperator> float argMax(UnaryOperator u) {
- return _totalAgg(NVMatrixAggs::ArgMax<UnaryOperator>(u));
- }
- static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream, const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev);
- static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream);
- static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev);
- static void batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB);
-
- static void assertSame(NVMatrixV& a);
-};
-
-// NVMatrix variant whose storage lives in host memory rather than on the
-// device. NOTE(review): the actual allocation strategy (pinned vs. pageable)
-// is decided in alloc()/dealloc(), defined elsewhere — confirm there.
-class HostNVMatrix : public NVMatrix {
-protected:
- // Storage management overrides (definitions elsewhere).
- void alloc(int numElements);
- void dealloc();
- // Factory overrides so NVMatrix operations that need to create result
- // matrices produce HostNVMatrix instances with the requested shape/layout.
- NVMatrix& construct() const;
- NVMatrix& construct(bool isTrans) const;
- NVMatrix& construct(int numRows, int numCols, bool isTrans=false) const;
- NVMatrix& construct(const Matrix& like, bool copy) const;
- NVMatrix& construct(const NVMatrix& like, bool copy) const;
- NVMatrix& construct(const NVMatrix& like) const;
- NVMatrix& construct(const Matrix& like) const;
- NVMatrix& construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const;
-public:
- ~HostNVMatrix();
- HostNVMatrix();
- HostNVMatrix(bool isTrans);
- HostNVMatrix(int numRows, int numCols, bool isTrans=false);
- HostNVMatrix(const Matrix& like, bool copy);
- HostNVMatrix(const NVMatrix& like, bool copy);
- HostNVMatrix(const NVMatrix& like);
- HostNVMatrix(const Matrix& like);
- HostNVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans);
- // Host<->host copies against the CPU-side Matrix type; the stream overloads
- // accept a stream parameter for interface compatibility with NVMatrix.
- void copyFromHost(const Matrix& hostMatrix);
- void copyFromHost(const Matrix& hostMatrix, bool resizeTarget);
- void copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream);
- void copyToHost(Matrix& hostMatrix) const;
- void copyToHost(Matrix& hostMatrix, bool resizeTarget) const;
- void copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const;
- cudaTextureObject_t getTextureObject();
-};
-
-#endif /* NVMATRIX_H_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef NVMATRIX_KERNEL_H_
-#define NVMATRIX_KERNEL_H_
-
-#include <curand_kernel.h>
-
-#if defined(_WIN64) || defined(_WIN32)
-#define uint unsigned int
-#endif
-
-#define NUM_BLOCKS_MAX 65535
-#define TEXTURE_SIZE_MAX (1<<29)
-
-#define NUM_RND_BLOCKS 96
-#define NUM_RND_THREADS_PER_BLOCK 128
-#define NUM_RND_STREAMS (NUM_RND_BLOCKS * NUM_RND_THREADS_PER_BLOCK)
-
-/*
- * Default grid/block sizes for the various functions.
- */
-#define ADD_BLOCK_SIZE 16
-
-#define NUM_TILE_BLOCKS 4096
-#define NUM_TILE_THREADS_PER_BLOCK 512
-
-#define ELTWISE_THREADS_X 32
-#define ELTWISE_THREADS_Y 8
-
-#define ELTWISE_FLAT_THREADS_X 128
-
-#define NUM_SUM_COLS_THREADS_PER_BLOCK 128
-
-#define AGG_SHORT_ROWS_THREADS_X 32
-#define AGG_SHORT_ROWS_THREADS_Y 8
-#define AGG_SHORT_ROWS_LOOPS_Y 32
-
-#define DP_BLOCKSIZE 512
-#define CPUSUM_MAX 4096
-
-#define ADD_VEC_THREADS_X 64
-#define ADD_VEC_THREADS_Y 4
-
-#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y) - 1) / (y))
-#endif
-
-#define MYMAX(a, b) ((a) > (b) ? (a) : (b))
-
-#ifndef MUL24 // legacy
-#define MUL24(x,y) ((x) * (y))
-#endif
-
-#define AWR_NUM_THREADS 256
-#define WARP_SIZE 32
-// Parenthesized so the macro is safe inside larger expressions: unparenthesized
-// it expanded e.g. (WARP_SIZE + 1) * AWR_NUM_WARPS to 33 * 256 / 32, which is
-// only coincidentally equal to 33 * (256 / 32).
-#define AWR_NUM_WARPS (AWR_NUM_THREADS / WARP_SIZE)
-#define AWR_LOG_NUM_THREADS 8
-#define LOG_WARP_SIZE 5
-#define AWR_LOG_NUM_WARPS 3
-
-#define DEVICE_HOST -1
-#define DEVICE_NULL -2
-
-__global__ void kTile(const float* src, float* tgt, const uint srcWidth, const uint srcHeight, const uint tgtWidth, const uint tgtHeight);
-__global__ void kDotProduct_r(float* a, float* b, float* target, const uint numElements);
-__global__ void kSetupCurand(curandState *state, unsigned long long seed);
-
-// Warp shuffle-down wrapper: on sm_30+ forwards to __shfl_down(a, b, c),
-// i.e. reads 'a' from the lane 'b' positions below the caller within a
-// group of width 'c'. On older architectures the intrinsic does not exist,
-// so this returns 0 — callers must not depend on the result pre-sm_30.
-template<typename T>
-__device__ T shfl_down(T a, int b, int c=WARP_SIZE) {
-#if __CUDA_ARCH__ >= 300
- return __shfl_down(a, b, c);
-#else
- return 0;
-#endif
-}
-
-/*
- * For now this is supported only for arrays with the same transposedness.
- */
-template<class Op>
-__global__ void kEltwiseTernaryOp(const float* a, const float* b, const float* c, float* const dest,
- const uint height, const uint width, uint strideA, const uint strideB, const uint strideC,
- const uint strideDest, Op op) {
- // Per-thread starting coordinates; threads then stride by the full grid
- // extent in both dimensions, so any launch size covers the whole matrix.
- const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x;
- const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y;
-
- for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) {
- for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) {
- // dest[y][x] = op(a[y][x], b[y][x], c[y][x]); each array has its own row stride.
- dest[y * strideDest + x] = op(a[y * strideA + x], b[y * strideB + x], c[y * strideC + x]);
- }
- }
-}
-
-// Flat (1-D, contiguous) version of the ternary op: dest[i] = op(a[i], b[i], c[i]).
-template<class Op>
-__global__ void kEltwiseTernaryOpFlat(const float* a, const float* b, const float* c, float* const dest, const uint numElements, Op op) {
- const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x;
-
- // Each thread strides by the total number of launched threads.
- for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) {
- dest[x] = op(a[x], b[x], c[x]);
- }
-}
-
-
-/*
- * dest here is assumed to be "not transposed" -- height and width correspond to it.
- * b is assumed to be transposed.
- * a can be either transposed or not -- depending on parameter.
- *
- * Performs dest := op(a, b)
- */
-template<class Op, bool checkBounds, bool aTrans, bool reverse>
-__global__ void kEltwiseBinaryOpTrans(const float* a, const float* b, float* const dest,
- const uint height, const uint width,
- const uint strideA, const uint strideB, const uint strideDest, Op op) {
-
- // Square tile staged in shared memory to transpose b's layout into dest's.
- // The extra +1 column keeps same-column accesses on distinct banks.
- __shared__ float shmem[ELTWISE_THREADS_X][ELTWISE_THREADS_X + 1];
-
- // x here because that's how much work we do
- for (uint by = ELTWISE_THREADS_X * blockIdx.y; by < height; by += ELTWISE_THREADS_X * gridDim.y) {
- for (uint bx = ELTWISE_THREADS_X * blockIdx.x; bx < width; bx += ELTWISE_THREADS_X * gridDim.x) {
- // Read coordinates are swapped relative to write coordinates: the tile
- // is loaded in b's (transposed) orientation and written out in dest's.
- const uint readX = by + threadIdx.x;
- const uint readY = bx + threadIdx.y;
-
- for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) {
- if (!checkBounds || (readX < height && readY + y < width)) {
- if (aTrans) {
- // a shares b's orientation: apply op while loading the tile.
- shmem[threadIdx.x][threadIdx.y + y] = reverse ? op(b[(readY+y) * strideB + readX], a[(readY+y) * strideA + readX])
- : op(a[(readY+y) * strideA + readX], b[(readY+y) * strideB + readX]);
- } else {
- // a matches dest's orientation: stage b only; op is applied on store.
- shmem[threadIdx.x][threadIdx.y + y] = b[(readY+y) * strideB + readX];
- }
- }
- }
- __syncthreads();
-
- const uint writeX = bx + threadIdx.x;
- const uint writeY = by + threadIdx.y;
-
- for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) {
- if(!checkBounds || (writeX < width && writeY + y < height)) {
- if (aTrans) {
- dest[(writeY + y) * strideDest + writeX] = shmem[threadIdx.y + y][threadIdx.x];
- } else {
- dest[(writeY + y) * strideDest + writeX] = reverse ? op(shmem[threadIdx.y + y][threadIdx.x], a[(writeY + y) * strideA + writeX])
- : op(a[(writeY + y) * strideA + writeX], shmem[threadIdx.y + y][threadIdx.x]);
- }
- }
- }
- // Second barrier: protect the tile from being overwritten by the next
- // iteration before every thread has finished reading it.
- __syncthreads();
- }
- }
-}
-// dest[y][x] = op(a[y][x], b[y][x]) for same-orientation matrices, each with
-// its own row stride; threads cover the matrix by striding the grid extent.
-template<class Op>
-__global__ void kEltwiseBinaryOp(const float* a, const float* b, float* const dest, const uint height, const uint width,
- const uint strideA, const uint strideB, const uint strideDest, Op op) {
- const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x;
- const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y;
-
- for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) {
- for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) {
- dest[y * strideDest + x] = op(a[y * strideA + x], b[y * strideB + x]);
- }
- }
-}
-
-// Flat (1-D, contiguous) version: dest[i] = op(a[i], b[i]).
-template<class Op>
-__global__ void kEltwiseBinaryOpFlat(const float* a, const float* b, float* const dest, const uint numElements, Op op) {
- const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x;
-
- for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) {
- dest[x] = op(a[x], b[x]);
- }
-}
-
-/*
- * dest here is assumed to be "not transposed" -- height and width correspond to it.
- */
-template<class Op, bool checkBounds>
-__global__ void kEltwiseUnaryOpTrans(const float* a, float* const dest,
- const uint height, const uint width,
- const uint strideA, const uint strideDest, Op op) {
-
- // Shared-memory tile used to transpose a's layout into dest's; the +1
- // column keeps same-column accesses on distinct banks.
- __shared__ float shmem[ELTWISE_THREADS_X][ELTWISE_THREADS_X + 1];
-
- for (uint by = ELTWISE_THREADS_X * blockIdx.y; by < height; by += ELTWISE_THREADS_X * gridDim.y) {
- for (uint bx = ELTWISE_THREADS_X * blockIdx.x; bx < width; bx += ELTWISE_THREADS_X * gridDim.x) {
- // Load in a's (transposed) orientation, applying op on the way in.
- const uint readX = by + threadIdx.x;
- const uint readY = bx + threadIdx.y;
- for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) {
- if (!checkBounds || (readX < height && readY + y < width)) {
- shmem[threadIdx.x][threadIdx.y + y] = op(a[(readY + y) * strideA + readX]);
- }
- }
- __syncthreads();
-
- // Store in dest's orientation (indices swapped through the tile).
- const uint writeX = bx + threadIdx.x;
- const uint writeY = by + threadIdx.y;
- for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) {
- if(!checkBounds || (writeX < width && writeY + y < height)) {
- dest[(writeY + y) * strideDest + writeX] = shmem[threadIdx.y + y][threadIdx.x];
-
- }
- }
- // Keep the next iteration from clobbering the tile while it is read.
- __syncthreads();
- }
- }
-}
-
-// Flat (1-D, contiguous) version: dest[i] = op(a[i]).
-template<class Op>
-__global__ void kEltwiseUnaryOpFlat(const float* a, float* const dest, const uint numElements, Op op) {
- const uint idxX = blockIdx.x * ELTWISE_FLAT_THREADS_X + threadIdx.x;
-
- for (uint x = idxX; x < numElements; x += gridDim.x * ELTWISE_FLAT_THREADS_X) {
- dest[x] = op(a[x]);
- }
-}
-
-// dest[y][x] = op(a[y][x]) for same-orientation matrices with independent
-// row strides; threads cover the matrix by striding the grid extent.
-template<class Op>
-__global__ void kEltwiseUnaryOp(const float* a, float* const dest, const uint height, const uint width,
- const uint strideA, const uint strideDest, Op op) {
- const uint idxX = blockIdx.x * ELTWISE_THREADS_X + threadIdx.x;
- const uint idxY = blockIdx.y * ELTWISE_THREADS_Y + threadIdx.y;
-
- for (uint y = idxY; y < height; y += gridDim.y * ELTWISE_THREADS_Y) {
- for (uint x = idxX; x < width; x += gridDim.x * ELTWISE_THREADS_X) {
- dest[y * strideDest + x] = op(a[y * strideA + x]);
- }
- }
-}
-
-/*
- * Matrix in ROW-MAJOR order!
- */
-template <class Op>
-__global__ void kRowVectorOp(const float* mat, const float* vec, float* const tgtMat, const uint width, const uint height,
- const uint matStride, const uint tgtStride, Op op) {
- // One tile of the row vector is cached in shared memory per x-iteration.
- __shared__ float shVec[ADD_VEC_THREADS_X];
- const uint bx = ADD_VEC_THREADS_X * blockIdx.x;
- const uint by = ADD_VEC_THREADS_Y * blockIdx.y;
-
- for (uint x = bx; x < width; x += gridDim.x * ADD_VEC_THREADS_X) {
- // First barrier protects the previous iteration's readers of shVec.
- __syncthreads();
- // Only the block's first row of threads loads the vector tile.
- if (x + threadIdx.x < width && threadIdx.y == 0) {
- shVec[threadIdx.x] = vec[x + threadIdx.x];
- }
- __syncthreads();
-
- if (x + threadIdx.x < width) {
- // Apply op(mat element, vector element for this column) down the rows.
- for (uint y = by + threadIdx.y; y < height; y += gridDim.y * ADD_VEC_THREADS_Y) {
- tgtMat[y * tgtStride + x + threadIdx.x] = op(mat[y * matStride + x + threadIdx.x], shVec[threadIdx.x]);
- }
- }
- }
-}
-
-/*
- * Matrix in ROW-MAJOR order!
- */
-template <class Op>
-__global__ void kColVectorOp(float* mat, float* vec, float* tgtMat,
- const uint width, const uint height,
- const uint matStride, const uint tgtStride, Op op) {
- // Cache of ADD_VEC_THREADS_Y consecutive column-vector entries.
- __shared__ float shVec[ADD_VEC_THREADS_Y];
- const uint by = ADD_VEC_THREADS_Y * blockIdx.y;
- const uint bx = ADD_VEC_THREADS_X * blockIdx.x;
- const uint tidx = ADD_VEC_THREADS_X * threadIdx.y + threadIdx.x;
-
- // Pre-offset the pointers: after this, mat/tgtMat row index y below
- // actually addresses row (y + threadIdx.y), and vec[y] is vec[y + tidx].
- mat += threadIdx.y * matStride;
- vec += tidx;
- tgtMat += threadIdx.y * tgtStride;
-
- for (uint y = by; y < height; y += gridDim.y * ADD_VEC_THREADS_Y) {
- // First barrier protects the previous iteration's readers of shVec.
- __syncthreads();
- // The first ADD_VEC_THREADS_Y threads (linear id) load the vector tile.
- if (y + tidx < height && tidx < ADD_VEC_THREADS_Y) {
- shVec[tidx] = vec[y];
- }
- __syncthreads();
-
- if (y + threadIdx.y < height) {
- // op(mat element, vector element for this row) across the columns.
- for (uint x = bx + threadIdx.x; x < width; x += gridDim.x * ADD_VEC_THREADS_X) {
- tgtMat[(y) * tgtStride + x] = op(mat[(y) * matStride + x], shVec[threadIdx.y]);
- }
- }
- }
-}
-
-/*
- * This one gets coalesced reads but computes only a partial sum which
- * must either be summed again (recursively) or summed on the host.
- */
-// Row aggregation: each block reduces 2*blockSize consecutive columns of a
-// row into one partial value and combines it into matSum via bop. Results
-// are partial when gridDim.x > 1 (see comment above the definition).
-template<class Agg, class UnaryOp, class BinaryOp, int blockSize>
-__global__ void kAggRows(const float* mat, float* matSum, const uint width, const uint height, const uint sumWidth, Agg agg, UnaryOp uop, BinaryOp bop) {
- const int idxX = blockIdx.x * blockSize*2 + threadIdx.x;
-
- __shared__ float accum[blockSize*2];
-
- matSum += blockIdx.y * sumWidth + blockIdx.x;
- /*
- * Here it's important to make sure that all threads in a block call __syncthreads,
- * so I have even the redundant threads (for which idxX >= width) enter this loop
- * just so that they may call __syncthreads at the appropriate times.
- */
- mat += width * blockIdx.y + idxX;
-
- // NOTE(review): accum is seeded with the base value only once, before the
- // row loop. Threads with idxX >= width never refresh their slots, so on
- // iterations after the first those slots hold leftovers from the previous
- // reduction. Verify at call sites that this cannot corrupt the aggregate.
- accum[threadIdx.x] = agg.getBaseValue();
- accum[threadIdx.x + blockSize] = agg.getBaseValue();
- for (uint idxY = blockIdx.y; idxY < height; idxY += gridDim.y) {
- if (idxX < width) {
- accum[threadIdx.x] = uop(mat[0]);
- if(idxX + blockSize < width)
- accum[threadIdx.x + blockSize] = uop(mat[blockSize]);
- }
- // Shared-memory tree reduction; stages compiled out for small blockSize.
- if (blockSize >= 512) {
- __syncthreads();
- if (threadIdx.x < 512)
- accum[threadIdx.x] = agg(accum[threadIdx.x], accum[threadIdx.x + 512]);
- }
- if (blockSize >= 256) {
- __syncthreads();
- if (threadIdx.x < 256)
- accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 256]);
- }
- if (blockSize >= 128) {
- __syncthreads();
- if (threadIdx.x < 128)
- accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 128]);
- }
- if (blockSize >= 64) {
- __syncthreads();
- if (threadIdx.x < 64)
- accum[threadIdx.x] = agg(accum[threadIdx.x],accum[threadIdx.x + 64]);
- }
-
- __syncthreads();
- // Final warp-level stage through a volatile pointer (pre-Volta idiom:
- // relies on warp-synchronous execution instead of barriers).
- volatile float* myAccum = &accum[threadIdx.x];
- if (threadIdx.x < 32) { // executed only by first warp
- myAccum[0] = agg(myAccum[0], myAccum[32]);
- myAccum[0] = agg(myAccum[0], myAccum[16]);
- myAccum[0] = agg(myAccum[0], myAccum[8]);
- myAccum[0] = agg(myAccum[0], myAccum[4]);
- myAccum[0] = agg(myAccum[0], myAccum[2]);
- myAccum[0] = agg(myAccum[0], myAccum[1]);
- }
-
- if (threadIdx.x == 0) {
- // Combine this row's partial into the output via bop, then advance to
- // the next row this block is responsible for.
- matSum[0] = bop(matSum[0], myAccum[0]);
- matSum += gridDim.y * sumWidth;
- }
- __syncthreads();
- mat += width * gridDim.y;
- }
-}
-
-// Whole-row aggregation: one block reduces an entire row (striding across
-// its width with AWR_NUM_THREADS threads), producing one value per row.
-template<class Agg, class BinaryOp>
-__global__ void kAggRows_wholerow(const float* mat, float* matSum, const uint width, const uint height, Agg agg, BinaryOp op) {
- const int tidx = threadIdx.x;
-
- __shared__ float accum[AWR_NUM_THREADS];
- // Same slot viewed two ways: volatile for the warp-synchronous tail.
- volatile float* vMyAccum = &accum[tidx];
- float* myAccum = &accum[tidx];
-
- matSum += blockIdx.y;
- mat += width * blockIdx.y;
-
- for (uint idxY = blockIdx.y; idxY < height; idxY += gridDim.y) {
- // Sequential accumulation across the row, then a tree reduction.
- myAccum[0] = agg.getBaseValue();
- for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
- myAccum[0] = agg(myAccum[0], mat[x]);
- }
- #pragma unroll
- for (uint i = AWR_LOG_NUM_THREADS - 1; i > LOG_WARP_SIZE; i--) {
- const uint d = 1 << i;
- __syncthreads();
- if (tidx < d) {
- myAccum[0] = agg(myAccum[0], myAccum[d]);
- }
- }
- __syncthreads();
- if (tidx < WARP_SIZE) {
- // Warp-level tail without barriers (pre-Volta volatile idiom).
- #pragma unroll
- for (int i = LOG_WARP_SIZE; i >= 0; i--) {
- const uint d = 1 << i;
- vMyAccum[0] = agg(vMyAccum[0], vMyAccum[d]);
- }
-
- if (tidx == 0) {
- matSum[0] = op(matSum[0], vMyAccum[0]);
- matSum += gridDim.y;
- }
- }
- __syncthreads();
- mat += width * gridDim.y;
- }
-}
-
-/*
- * Implements multiscan idea from http://www.moderngpu.com
- * Not really useful for pure reductions but neat nonetheless.
- */
-// Whole-row aggregation using per-warp shuffle reductions; only one
-// __syncthreads separates the warp phase from the cross-warp phase.
-// NOTE(review): handles a single row per block (no idxY loop) — each block y
-// owns exactly row blockIdx.y; confirm launch configuration matches.
-template<class Agg, class UnaryOp, class BinaryOp>
-__global__ void kAggRows_wholerow_nosync(const float* mat, float* matSum, const uint width, const uint height,
- Agg agg, UnaryOp uop, BinaryOp bop) {
- const uint tidx = threadIdx.x;
- const uint warpIdx = tidx / WARP_SIZE;
- const uint lane = tidx % WARP_SIZE;
-
- // One padded row of shared memory per warp (+1 avoids bank conflicts).
- __shared__ float accum[(WARP_SIZE + 1) * AWR_NUM_WARPS];
- __shared__ float finalAccum[AWR_NUM_WARPS];
-
- float* myAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
- float* myFinalAccum = &finalAccum[tidx];
- //volatile float* vMyAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
- matSum += blockIdx.y;
- mat += width * blockIdx.y;
-
- float rAccum = agg.getBaseValue(); // cache in register, a bit faster than shmem
- #pragma unroll 32
- for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
- rAccum = agg(rAccum, uop(mat[x]));
- }
- myAccum[0] = rAccum;
-
- // Each warp does a reduction that doesn't require synchronization
- #pragma unroll
- for (uint i = 0; i < LOG_WARP_SIZE; i++) {
- const uint d = 1 << i;
- myAccum[0] = agg(myAccum[0], shfl_down(myAccum[0], d));
- }
- __syncthreads();
- // The warps write their results
- if (tidx < AWR_NUM_WARPS) {
- //volatile float* vMyFinalAccum = &finalAccum[tidx];
- // Thread i picks up warp i's result (lane 0 of each padded row),
- // then the first warp shuffle-reduces across the warps' results.
- myFinalAccum[0] = accum[tidx * (WARP_SIZE + 1)];
- #pragma unroll
- for (uint i = 0; i < AWR_LOG_NUM_WARPS; i++) {
- const uint d = 1 << i;
- myFinalAccum[0] = agg(myFinalAccum[0], shfl_down(myFinalAccum[0], d));
- }
- if (tidx == 0) {
- matSum[0] = bop(matSum[0], myFinalAccum[0]);
- matSum += gridDim.y;
- }
- }
-}
-
-/*
- * To be used when the rows are <= 64.
- *
- * TODO: try to reduce reg usage. i think this can be made faster too.
- */
-//#define AGG_SHORT_ROWS_LOOPS_X 4
-template <class Agg, class UnaryOp, class BinaryOp, int LOOPS_X, int THREADS_X>
-__global__ void kAggShortRows(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
- // One padded shared row per thread-row; +1 column avoids bank conflicts.
- const uint shmemX = THREADS_X + 1;
- __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX];
-
- const uint tidx = threadIdx.y * THREADS_X + threadIdx.x;
- // When LOOPS_X == 1 the row fits inside one block-x sweep, so the block's
- // linear thread id is re-mapped to (row, col) using the actual width.
- const uint ty = LOOPS_X == 1 ? tidx / width : threadIdx.y; // when loops==1, width is gonna be smaller than block x dim
- const uint tx = LOOPS_X == 1 ? tidx % width : threadIdx.x;
- const uint bidx = blockIdx.y * gridDim.x + blockIdx.x;
- const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;
- float* shmemWrite = shmem + MUL24(ty, shmemX) + tx;
- matSum += blockRowIdx + tidx;
-// shmem[MUL24(threadIdx.y, shmemX) + threadIdx.x] = 0;
- mat += width * blockRowIdx + MUL24(ty, width) + tx;
- float* shmemWriteZeros = &shmem[MUL24(threadIdx.y,shmemX) + threadIdx.x];
-
- // Only the first AGG_SHORT_ROWS_THREADS_Y threads write final results;
- // doAgg is progressively cleared as the bottom of the matrix is passed.
- bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y ;
-
- if (blockRowIdx < height) {
-#pragma unroll
- for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
- doAgg &= tidx + y + blockRowIdx < height;
- const bool heightIdxOK = ty < AGG_SHORT_ROWS_THREADS_Y && ty + y + blockRowIdx < height;
-
- // Reset this thread's shared slot, then accumulate its row chunk.
- shmemWriteZeros[0] = agg.getBaseValue();
- __syncthreads();
-#pragma unroll
- for(uint x = 0; x < LOOPS_X * THREADS_X; x+= THREADS_X) {
-// __syncthreads();
- if (heightIdxOK && x + tx < width) {
- shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
- }
- }
- __syncthreads();
- if (doAgg) {
- /*
- * I tried doing this final sum as a 4-step reduction, with 8 threads
- * per warp participating. It was slightly slower.
- */
- float accum = agg.getBaseValue();
- float* shmemRead = shmem + MUL24(tidx, shmemX);
- // this loops too much if the rows are really short :(
-#pragma unroll
- for (uint i = 0; i < THREADS_X; i++) {
- accum = agg(accum, shmemRead[0]);
- shmemRead++;
- }
- matSum[0] = bop(matSum[0], accum);
- matSum += AGG_SHORT_ROWS_THREADS_Y;
- }
- __syncthreads();
- mat += width * AGG_SHORT_ROWS_THREADS_Y;
- }
- }
-}
-
-// Variant of kAggShortRows with a runtime LOOPS_X (no thread-id remapping),
-// used when the column count is not known at compile time.
-template <class Agg, class UnaryOp, class BinaryOp>
-__global__ void kAggShortRows2(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
- const uint shmemX = AGG_SHORT_ROWS_THREADS_X + 1;
- __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX];
- const uint LOOPS_X = DIVUP(width, AGG_SHORT_ROWS_THREADS_X);
- const uint tidx = threadIdx.y * AGG_SHORT_ROWS_THREADS_X + threadIdx.x;
-
- const uint bidx = blockIdx.y * gridDim.x + blockIdx.x;
- const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;
-
- float* shmemWrite = shmem + MUL24(threadIdx.y, shmemX) + threadIdx.x;
- matSum += blockRowIdx + tidx;
-// shmem[MUL24(threadIdx.y, shmemX) + threadIdx.x] = 0;
- mat += width * blockRowIdx + MUL24(threadIdx.y, width) + threadIdx.x;
-
- bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y;
- if(blockRowIdx < height) {
- for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
- doAgg &= tidx + y + blockRowIdx < height;
- const bool heightIdxOK = threadIdx.y + y + blockRowIdx < height;
- float accum = agg.getBaseValue();
- // Reset this thread's shared slot, then accumulate its row chunk.
- shmemWrite[0] = agg.getBaseValue();
-
- for(uint x = 0; x < LOOPS_X * AGG_SHORT_ROWS_THREADS_X; x+= AGG_SHORT_ROWS_THREADS_X) {
-// __syncthreads();
- if (heightIdxOK && x + threadIdx.x < width) {
- shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
- }
- }
-
- __syncthreads();
- if (doAgg) {
- // Thread tidx serially folds shared row tidx into the final value.
- float* shmemRead = shmem + MUL24(tidx, shmemX);
-
-#pragma unroll
- for (uint i = 0; i < AGG_SHORT_ROWS_THREADS_X; i++) {
- accum = agg(accum, shmemRead[0]);
- shmemRead++;
- }
-
- matSum[0] = bop(matSum[0], accum);
- matSum += AGG_SHORT_ROWS_THREADS_Y;
- }
- __syncthreads();
- mat += width * AGG_SHORT_ROWS_THREADS_Y;
- }
- }
-}
-
-/*
- * Bad when there are few columns.
- */
-// Column aggregation, one thread per column: thread idx walks all rows of
-// its column through the texture object and folds the result into vec via
-// bop. Simple but serial in height (see comment above the definition).
-template <class Agg, class UnaryOp, class BinaryOp>
-__global__ void kDumbAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
- const uint idx = blockIdx.x * blockDim.x + threadIdx.x;
- if (idx < width) {
- float mx = agg.getBaseValue();
- for (uint j = 0; j < height; j++) {
- // Row-major fetch: element (j, idx) lives at linear offset j*width + idx.
- mx = agg(uop(tex1Dfetch<float>(mat, width * j + idx)), mx);
- }
- vec[idx] = bop(vec[idx], mx);
- }
-}
-
-/*
- * Better with few columns because it only computes a partial sum.
- */
-// Column aggregation producing partial sums: block row blockIdx.y reduces a
-// band of sumLength rows, writing one partial value per column per band
-// (caller must reduce the partials again).
-template <class Agg, class UnaryOp>
-__global__ void kAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, const uint sumLength, Agg agg, UnaryOp op) {
- const uint idxX = blockIdx.x * blockDim.x + threadIdx.x;
- const uint idxY = blockIdx.y * sumLength;
- if (idxX < width) {
- float mx = agg.getBaseValue();
- for (uint j = idxY; j < min(height,idxY + sumLength); j++) {
- mx = agg(op(tex1Dfetch<float>(mat, j * width + idxX)), mx);
- }
- vec[blockIdx.y * width + idxX] = mx;
- }
-}
-
-// Whole-array aggregation: each block reduces a grid-strided slice of 'a'
-// into one value in target[blockIdx.x] (caller reduces the per-block values).
-// The shared-memory reduction assumes blockDim.x == DP_BLOCKSIZE (512).
-template <class Agg>
-__global__ void kTotalAgg(const float* a, float* const target, const uint numElements, Agg agg) {
- __shared__ float shmem[DP_BLOCKSIZE];
- uint eidx = DP_BLOCKSIZE * blockIdx.x + threadIdx.x;
- shmem[threadIdx.x] = agg.getBaseValue();
- if (eidx < gridDim.x * DP_BLOCKSIZE) {
- for (; eidx < numElements; eidx += gridDim.x * DP_BLOCKSIZE) {
- shmem[threadIdx.x] = agg(shmem[threadIdx.x], a[eidx]);
- }
- }
- // Standard tree reduction down to one warp...
- __syncthreads();
- if (threadIdx.x < 256) {
- shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 256]);
- }
- __syncthreads();
- if (threadIdx.x < 128) {
- shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 128]);
- }
- __syncthreads();
- if (threadIdx.x < 64) {
- shmem[threadIdx.x] = agg(shmem[threadIdx.x], shmem[threadIdx.x + 64]);
- }
- __syncthreads();
- if (threadIdx.x < 32) {
- // ...then a barrier-free warp tail via volatile (pre-Volta idiom).
- volatile float* mysh = &shmem[threadIdx.x];
- *mysh = agg(*mysh, mysh[32]);
- *mysh = agg(*mysh, mysh[16]);
- *mysh = agg(*mysh, mysh[8]);
- *mysh = agg(*mysh, mysh[4]);
- *mysh = agg(*mysh, mysh[2]);
- *mysh = agg(*mysh, mysh[1]);
- if (threadIdx.x == 0) {
- target[blockIdx.x] = *mysh;
- }
- }
-}
-
-// Unary randomizer: adds zero-mean Gaussian noise with the given stdev.
-class AddGaussianUnaryRandomizer {
-private:
- const float stdev;
-public:
- AddGaussianUnaryRandomizer(float _stdev) : stdev(_stdev) {
- }
- __device__ inline float operator ()(float data, curandState* state) {
- return data + stdev * curand_normal(state);
- }
-};
-
-// Unary randomizer: Bernoulli sample — yields 1 when data exceeds a
-// uniform(0,1] draw, else 0 (so data is treated as a probability).
-class BinarizeUnaryRandomizer {
-public:
- __device__ inline float operator ()(float data, curandState* state) {
- return data > curand_uniform(state);
- }
-};
-
-// Unary randomizer: replaces the input entirely with a uniform(0,1] draw
-// (the data argument is intentionally ignored).
-class UniformUnaryRandomizer {
-public:
- __device__ inline float operator ()(float data, curandState* state) {
- return curand_uniform(state);
- }
-};
-
-// Unary randomizer: replaces the input with N(mean, stdev^2) (data ignored).
-class GaussianUnaryRandomizer {
-private:
- const float mean, stdev;
-public:
- GaussianUnaryRandomizer(float _mean, float _stdev) : mean(_mean), stdev(_stdev) {
- }
- __device__ inline float operator ()(float data, curandState* state) {
- return mean + stdev * curand_normal(state);
- }
-};
-
-// Binary randomizer: adds Gaussian noise scaled by the second operand.
-// With var == true the noise scale is stdev * stdev (stdev interpreted as a
-// variance); with var == false it is stdev itself.
-template <bool var>
-class AddGaussianBinaryRandomizer {
-public:
- __device__ inline float operator ()(float data, float stdev, curandState* state) {
- return data + (var ? stdev : 1) * stdev * curand_normal(state);
- }
-};
-
-// Binary randomizer: draws N(mean, stdev^2) where stdev comes from the
-// second operand per element (the data argument is ignored).
-class GaussianBinaryRandomizer {
-private:
- const float mean;
-public:
- GaussianBinaryRandomizer(float _mean) : mean(_mean) {
- }
- __device__ inline float operator ()(float data, float stdev, curandState* state) {
- return mean + stdev * curand_normal(state);
- }
-};
-
-// Binary randomizer: like GaussianBinaryRandomizer, with the per-element
-// stdev additionally multiplied by a fixed stdevScale (data ignored).
-class ScaledGaussianBinaryRandomizer {
-private:
- const float mean, stdevScale;
-public:
- ScaledGaussianBinaryRandomizer(float _mean, float _stdevScale) : mean(_mean), stdevScale(_stdevScale) {
- }
- __device__ inline float operator ()(float data, float stdev, curandState* state) {
- return mean + stdevScale * stdev * curand_normal(state);
- }
-};
-
-// Applies a unary randomizer element-wise: targets[i] = rnd(data[i], state).
-// Each thread owns one curand stream, copied to a register-local state for
-// the loop and written back at the end so the sequence advances.
-template<class Randomizer>
-__global__ void kUnaryRandomize(float* data, float* targets, curandState* state, const uint numElements, Randomizer rnd) {
- const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
- curandState localState = state[tidx];
-
- for (uint i = tidx; i < numElements; i += NUM_RND_STREAMS) {
- targets[i] = rnd(data[i], &localState);
- }
- state[tidx] = localState;
-}
-
-// Binary analogue of kUnaryRandomize:
-// targets[i] = rnd(data[i], data2[i], state), same per-thread stream scheme.
-template<class Randomizer>
-__global__ void kBinaryRandomize(float* data, float* data2, float* targets, curandState* state, const uint numElements, Randomizer rnd) {
- const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
- curandState localState = state[tidx];
-
- for (uint i = tidx; i < numElements; i += NUM_RND_STREAMS) {
- targets[i] = rnd(data[i], data2[i], &localState);
- }
- state[tidx] = localState;
-}
-
-#endif /* NVMATRIX_KERNEL_H_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef NVMATRIX_OPERATORS_CUH
-#define NVMATRIX_OPERATORS_CUH
-
-// Namespace-class of unary device functors used with the kEltwiseUnaryOp*
-// kernels. Each functor maps one float to one float. Several use CUDA fast
-// intrinsics (__expf, __logf, __fdividef, __powf), which trade accuracy for
-// speed relative to the standard math functions.
-class NVMatrixOps {
-public:
- class Exp {
- public:
- __device__ inline float operator()(const float a) const {
- return __expf(a);
- }
- };
-
- // Sigmoid: 1 / (1 + e^-a), via fast intrinsics.
- class Logistic {
- public:
- __device__ inline float operator()(const float a) const {
- return __fdividef(1.0f, 1.0f + __expf(-a));
- }
- };
-
- class Log {
- public:
- __device__ inline float operator()(const float a) const {
- return __logf(a);
- }
- };
-
- class Square {
- public:
- __device__ inline float operator()(const float a) const {
- return a * a;
- }
- };
-
- class Sqrt {
- public:
- __device__ inline float operator()(const float a) const {
- return sqrtf(a);
- }
- };
-
- class SqrtAbs {
- public:
- __device__ inline float operator()(const float a) const {
- return sqrtf(fabsf(a));
- }
- };
-
- class Reciprocal {
- public:
- __device__ inline float operator()(const float a) const {
- return 1.0f / a;
- }
- };
-
- class Abs {
- public:
- __device__ inline float operator()(const float a) const {
- return a > 0 ? a : -a;
- }
- };
-
- // Sign: -1, 0, or +1 via the difference of two boolean comparisons.
- class Sign {
- public:
- __device__ inline float operator()(const float a) const {
- return (a > 0) - (a < 0);
- }
- };
-
- class Identity {
- public:
- __device__ inline float operator()(const float a) const {
- return a;
- }
- };
-
- class Zero {
- public:
- __device__ inline float operator()(const float a) const {
- return 0;
- }
- };
-
- class One {
- public:
- __device__ inline float operator()(const float a) const {
- return 1;
- }
- };
-
- // Ignores the input and returns a fixed scalar.
- class Const {
- private:
- const float scalar;
- public:
- Const(const float _scalar) : scalar(_scalar) {
- }
- __device__ inline float operator()(const float a) const {
- return scalar;
- }
- };
-
- class OneMinus {
- public:
- __device__ inline float operator()(const float x) const {
- return 1.0f - x;
- }
- };
-
- // Affine map: a*x + b.
- class Linear {
- protected:
- float _a, _b;
- public:
- __device__ inline float operator()(float x) const {
- return _a * x + _b;
- }
- Linear(float a, float b) : _a(a), _b(b) {
- }
- };
-
- // Predicate functors below return 0.0f / 1.0f.
- class IsNan {
- public:
- __device__ inline float operator()(const float a) const {
- return isnan(a);
- }
- };
-
- class IsInf {
- public:
- __device__ inline float operator()(const float a) const {
- return isinf(a);
- }
- };
-
- class SmallerThanScalar {
- private:
- const float scalar;
- public:
- SmallerThanScalar(const float _scalar) : scalar(_scalar) {
- }
- __device__ inline float operator()(const float a) const {
- return a < scalar;
- }
- };
-
- class BiggerThanScalar {
- private:
- const float scalar;
- public:
- BiggerThanScalar(const float _scalar) : scalar(_scalar) {
- }
- __device__ inline float operator()(const float a) const {
- return a > scalar;
- }
- };
-
- class AddScalar {
- private:
- const float scalar;
- public:
- AddScalar(const float _scalar) : scalar(_scalar) {
- }
- __device__ inline float operator()(const float a) const {
- return a + scalar;
- }
- };
-
- class WeightedAddScalar {
- private:
- const float weight, scalar;
- public:
- WeightedAddScalar(const float _weight, const float _scalar) : weight(_weight), scalar(_scalar) {
- }
- __device__ inline float operator()(const float a) const {
- return weight * a + scalar;
- }
- };
-
- class MultByScalar {
- private:
- const float scalar;
- public:
- MultByScalar(const float _scalar) : scalar(_scalar) {
- }
- __device__ inline float operator()(const float a) const {
- return a * scalar;
- }
- };
-
- class Pow {
- private:
- const float p;
- public:
- Pow(const float _p) : p(_p) {
- }
- __device__ inline float operator()(const float a) const {
- return __powf(a, p);
- }
- };
-
- // Range predicate; 'exclusive' selects strict vs. inclusive bounds.
- template <bool exclusive>
- class InRange {
- private:
- const float lower, upper;
- public:
- InRange(const float _lower, const float _upper) : lower(_lower), upper(_upper) {
- }
- __device__ inline float operator()(const float a) const {
- return exclusive ? a > lower && a < upper : a >= lower && a <= upper;
- }
- };
-
- class MinWithScalar {
- private:
- const float scalar;
- public:
- MinWithScalar(const float _scalar) : scalar(_scalar) {
- }
- __device__ inline float operator()(const float a) const {
- return a > scalar ? scalar : a;
- }
- };
-
- class MaxWithScalar {
- private:
- const float scalar;
- public:
- MaxWithScalar(const float _scalar) : scalar(_scalar) {
- }
- __device__ inline float operator()(const float a) const {
- return a > scalar ? a : scalar;
- }
- };
-};
-
-// Namespace-class of binary device functors used with the kEltwiseBinaryOp*
-// kernels. Each functor maps (a, b) -> float; comparison functors return
-// 0.0f / 1.0f.
-class NVMatrixBinaryOps {
-public:
- // Empty tag base class for binary functors.
- class BinaryOp {
- public:
- };
- class Equals : public BinaryOp {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a == b;
- }
- };
-
- class BiggerThan : public BinaryOp {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a > b;
- }
- };
-
- // Fast-intrinsic division (reduced accuracy vs. a / b).
- class Divide : public BinaryOp {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return __fdividef(a, b);
- }
- };
-
- class DivideAccurate : public BinaryOp {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a / b;
- }
- };
-
- // "Safe" variants map division by zero to 0 instead of inf/nan.
- class DivideSafe : public BinaryOp {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return b == 0 ? 0 : __fdividef(a, b);
- }
- };
-
- class DivideSafeAccurate : public BinaryOp {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return b == 0 ? 0 : (a / b);
- }
- };
-
- class Multiply : public BinaryOp {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a * b;
- }
- };
-
- class SquaredDiff : public BinaryOp {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return (a - b) * (a - b);
- }
- };
-
- // a*scaleA + b*scaleB.
- class WeightedAdd : public BinaryOp {
- private:
- const float scaleA, scaleB;
- public:
- WeightedAdd(const float _scaleA, const float _scaleB) : scaleA(_scaleA), scaleB(_scaleB) {
- }
- WeightedAdd() : scaleA(0), scaleB(0) { // Compiler complains about no default constructor?
- }
- __device__ inline float operator()(const float a, const float b) const {
- return a * scaleA + b * scaleB;
- }
- };
-
- // a + b*scaleB (same computation as ScaledAdd below; both kept for
- // source compatibility with existing call sites).
- class WeightedAdd1 : public BinaryOp {
- private:
- const float scaleB;
- public:
- WeightedAdd1(const float _scaleB) : scaleB(_scaleB) {
- }
- __device__ inline float operator()(const float a, const float b) const {
- return a + b * scaleB;
- }
- };
-
- class ScaledAdd : public BinaryOp {
- private:
- const float scaleB;
- public:
- ScaledAdd(const float _scaleB) : scaleB(_scaleB) {
- }
- __device__ inline float operator()(const float a, const float b) const {
- return a + b * scaleB;
- }
- };
-
- class Add : public BinaryOp {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a + b;
- }
- };
-
- // Projections: keep only one operand.
- class First : public BinaryOp {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a;
- }
- };
-
- class Second : public BinaryOp {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return b;
- }
- };
-
- class SecondScaled : public BinaryOp {
- private:
- const float scale;
- public:
- SecondScaled(const float _scale) : scale(_scale) {
- }
-
- SecondScaled() : scale(0) { // Compiler complains about no default constructor?
- }
- __device__ inline float operator()(const float a, const float b) const {
- return scale * b;
- }
- };
-
- // Composition: applies uop to b before the binary op, i.e. bop(a, uop(b)).
- template<class UnaryOp, class BinaryOp>
- class CompositeSecond : public BinaryOp {
- private:
- UnaryOp _uop;
- BinaryOp _bop;
- public:
- CompositeSecond(UnaryOp uop, BinaryOp bop) : _uop(uop), _bop(bop) {
-
- }
- __device__ inline float operator()(const float a, const float b) const {
- return _bop(a, _uop(b));
- }
- };
-};
-
-class NVMatrixAggs {
-public:
- class Sum {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a + b;
- }
- __device__ inline float getBaseValue() {
- return 0;
- }
- };
-
- class Max {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a > b ? a : b;
- }
- __device__ inline float getBaseValue() {
- return -2e38;
- }
- };
-
- class Min {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a > b ? b : a;
- }
- __device__ inline float getBaseValue() {
- return 2e38;
- }
- };
-
- class CountNan {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a + isnan(b);
- }
- __device__ inline float getBaseValue() {
- return 0;
- }
- };
-
- class CountInf {
- public:
- __device__ inline float operator()(const float a, const float b) const {
- return a + isinf(b);
- }
- __device__ inline float getBaseValue() {
- return 0;
- }
- };
-
- template<class UnaryOperator>
- class ArgMax {
- private:
- UnaryOperator u;
- public:
- ArgMax(UnaryOperator _u) : u(_u) {
- }
- __device__ inline float operator()(const float a, const float b) const {
- return u(a) > u(b) ? a : b;
- }
- __device__ inline float getBaseValue() {
- return u.getArgMin();
- }
- };
-};
-
-class NVMatrixTernaryOps {
-public:
- class Add {
- public:
- __device__ inline float operator()(const float a, const float b, const float c) const {
- return a + b + c;
- }
- };
- class WeightedAdd {
- private:
- const float scaleA, scaleB, scaleC;
- public:
- WeightedAdd(const float _scaleA, const float _scaleB, const float _scaleC) : scaleA(_scaleA), scaleB(_scaleB), scaleC(_scaleC) {
- }
- __device__ inline float operator()(const float a, const float b, const float c) const {
- return a * scaleA + b * scaleB + c * scaleC;
- }
- };
-};
-
-#endif /* NVMATRIX_OPERATORS_CUH */
-
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/memory.cuh"
-
-Lock MemoryManager::_globalLock;
-std::map<int, MemoryManager*> FastMemoryManager::_memoryManagers;
-
-MemoryManager& FastMemoryManager::getInstance(int deviceID) {
- _globalLock.acquire();
- if (_memoryManagers.count(deviceID) == 0) {
- _memoryManagers[deviceID] = (new FastMemoryManager(deviceID))->init();
- }
- MemoryManager& ret = *_memoryManagers[deviceID];
- _globalLock.release();
- return ret;
-}
-
-MemoryManager* CUDAMemoryManager::_memoryManager = NULL;
-MemoryManager& CUDAMemoryManager::getInstance(int deviceID) {
- _globalLock.acquire();
- if (_memoryManager == NULL) {
- _memoryManager = new CUDAMemoryManager();
- }
- _globalLock.release();
- return *_memoryManager;
-}
-
-MemoryManager* CUDAHostMemoryManager::_memoryManager = NULL;
-MemoryManager& CUDAHostMemoryManager::getInstance() {
- _globalLock.acquire();
- if (_memoryManager == NULL) {
- _memoryManager = new CUDAHostMemoryManager();
- }
- _globalLock.release();
- return *_memoryManager;
-}
-
-MemoryManager* FastHostMemoryManager::_memoryManager = NULL;
-MemoryManager& FastHostMemoryManager::getInstance() {
- _globalLock.acquire();
- if (_memoryManager == NULL) {
- _memoryManager = (new FastHostMemoryManager())->init();
- }
- _globalLock.release();
- return *_memoryManager;
-}
-
-
-void FastMemoryManager::destroyInstance(int deviceID) {
- _globalLock.acquire();
- if (_memoryManagers.count(deviceID) != 0) {
- delete _memoryManagers[deviceID];
- _memoryManagers.erase(deviceID);
- }
- _globalLock.release();
-}
-
-void FastHostMemoryManager::destroyInstance() {
- _globalLock.acquire();
- if (_memoryManager != NULL) {
- delete _memoryManager;
- _memoryManager = NULL;
- }
- _globalLock.release();
-}
-
-void CUDAMemoryManager::destroyInstance(int deviceID) {
- _globalLock.acquire();
- if (_memoryManager != NULL) {
- delete _memoryManager;
- _memoryManager = NULL;
- }
- _globalLock.release();
-}
-
-void CUDAHostMemoryManager::destroyInstance() {
- _globalLock.acquire();
- if (_memoryManager != NULL) {
- delete _memoryManager;
- _memoryManager = NULL;
- }
- _globalLock.release();
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <set>
-#include <vector>
-#include <assert.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <fstream>
-#include <iostream>
-#include <algorithm>
-#include <typeinfo>
-#include <map>
-#include <cuda.h>
-#include <signal.h>
-#include "../include/nvmatrix.cuh"
-#include "../include/nvmatrix_operators.cuh"
-
-using namespace std;
-
-/*
- * Device random number generator pointers.
- */
-//map<int,curandGenerator_t> NVMatrix::rndGen;
-map<int,MemorySegment*> NVMatrix::_rndDevStates;
-map<int,int> NVMatrix::_rndDevThreads;
-pthread_mutex_t* NVMatrix::_rndMutex = makeMutex();
-pthread_mutex_t* NVMatrix::_cublasMutex = makeMutex();
-pthread_mutex_t* NVMatrix::_streamMutex = makeMutex();
-std::map<int,cublasHandle_t> NVMatrix::_cublasHandles;
-std::map<int,cudaStream_t> NVMatrix::_defaultStreams;
-
-pthread_mutex_t* NVMatrix::makeMutex() {
- pthread_mutex_t* m = (pthread_mutex_t*) malloc(sizeof(pthread_mutex_t));
- pthread_mutex_init(m, NULL);
- return m;
-}
-/*
- Do not call resize in _init because resize is a virtual function
- which is overridden in base class. Since C++ is retarded and unable
- to call overridden functions from constructors, we shall call resize
- separately from every constructor after calling _init.
-*/
-void NVMatrix::_init(bool isTrans) {
- _numRows = 0;
- _numCols = 0;
- _numElements = 0;
- _ownsData = true;
-
- _isTrans = isTrans;
- _memSegment = NULL;
-
- _stride = 0;
- _texObj = 0;
-}
-
-NVMatrix::NVMatrix() : _deleted(false) {
- _init(false);
-}
-
-NVMatrix::NVMatrix(bool isTrans) : _deleted(false) {
- _init(isTrans);
-}
-
-NVMatrix::NVMatrix(int numRows, int numCols, bool isTrans) : _deleted(false) {
- _init(isTrans);
- resize(numRows, numCols);
-}
-
-NVMatrix::NVMatrix(const Matrix& like, bool copy) : _deleted(false) {
- _init(like.isTrans());
- resize(like.getNumRows(), like.getNumCols());
- if (copy) {
- copyFromHost(like);
- }
-}
-
-NVMatrix::NVMatrix(const NVMatrix& like, bool copy) : _deleted(false) {
- _init(like.isTrans());
- resize(like.getNumRows(), like.getNumCols());
- if (copy) {
- like.copy(*this);
- }
-}
-
-/*
- * Initializes NVMatrix with same dimensions as given matrix but
- * does not copy any data.
- */
-NVMatrix::NVMatrix(const NVMatrix& like) : _deleted(false) {
- _init(like.isTrans());
- resize(like.getNumRows(), like.getNumCols());
-}
-
-/*
- * Initializes NVMatrix with same dimensions as given matrix but
- * does not copy any data.
- */
-NVMatrix::NVMatrix(const Matrix& like) : _deleted(false) {
- _init(false);
- resize(like.getNumRows(), like.getNumCols());
-}
-
-NVMatrix::NVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) :
- _numRows(numRows),
- _numCols(numCols),
- _numElements(numRows*numCols),
- _ownsData(false),
- _memSegment(mem),
- _isTrans(isTrans),
- _deleted(false),
- _texObj(0) {
- _stride = stride < 0 ? getLeadingDim() : stride;
-}
-
-NVMatrix::~NVMatrix() {
- if (!_deleted) {
- deallocTexture();
- if(_ownsData && _numElements > 0) {
- dealloc();
- } else {
- // dealloc deletes the mem segment. But if this is a view,
- // then we still need to delete the mem segment object.
-// assert(_memSegment == NULL || _memSegment->getSize() == 0);
- delete _memSegment;
- }
- }
-}
-
-void NVMatrix::copyFromHost(const Matrix& hostMatrix) {
- copyFromHost(hostMatrix, false, getDefaultStream());
-}
-
-void NVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget) {
- copyFromHost(hostMatrix, resizeTarget, getDefaultStream());
-}
-
-void NVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) {
- if (resizeTarget) {
- resize(hostMatrix);
- } else {
- assert(isSameDims(hostMatrix));
- }
- setTrans(hostMatrix.isTrans());
-
- if (getNumElements() > 0) {
- CUBLAS_CALL(cublasSetMatrixAsync(hostMatrix.getLeadingDim(), hostMatrix.getFollowingDim(), sizeof(float),
- hostMatrix.getData(), hostMatrix.getLeadingDim(), getDevData(), _stride, stream));
- syncStream(stream);
- }
-}
-
-void NVMatrix::copyToHost(Matrix& hostMatrix) const {
- copyToHost(hostMatrix, false, getDefaultStream());
-}
-
-void NVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget) const {
- copyToHost(hostMatrix, resizeTarget, getDefaultStream());
-}
-
-void NVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const {
- if (resizeTarget) {
- hostMatrix.resize(_numRows, _numCols);
- } else {
- assert(isSameDims(hostMatrix));
- }
- hostMatrix.setTrans(_isTrans);
-
- if (getNumElements() > 0) {
- CUBLAS_CALL(cublasGetMatrixAsync(getLeadingDim(),getFollowingDim(), sizeof(float),
- getDevData(), getStride(), hostMatrix.getData(), hostMatrix.getLeadingDim(), stream));
- syncStream(stream);
- }
-}
-
-void NVMatrix::copy(NVMatrix& dest) const {
- copy(dest, getDefaultStream());
-}
-
-void NVMatrix::copy(NVMatrix& dest, cudaStream_t stream) const {
- if (&dest != this) {
- if (!isSameDims(dest)) {
- dest.resize(*this);
- }
- copy(dest, 0, -1, 0, -1, 0, 0, stream);
- }
-}
-
-NVMatrix& NVMatrix::copy() const {
- NVMatrix& c = construct();
- copy(c);
- return c;
-}
-
-void NVMatrix::rightMult(NVMatrix &b, float scaleAB, NVMatrix &target) {
- rightMult(b, scaleAB, target, getDefaultStream());
-}
-
-void NVMatrix::rightMult(NVMatrix &b, float scaleAB, NVMatrix &target, cudaStream_t stream) {
-// if(&target != this && &target != &b) {
-// target.resize(_numRows, b.getNumCols());
-// target.setTrans(true);
-// }
- target.addProduct(*this, b, 0, scaleAB, stream);
-}
-
-void NVMatrix::rightMult(NVMatrix &b, float scaleAB) {
- rightMult(b, scaleAB, *this);
-}
-
-void NVMatrix::rightMult(NVMatrix &b, NVMatrix& target) {
- rightMult(b, 1, target);
-}
-
-void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB) {
- addProduct(a, b, scaleThis, scaleAB, getDefaultStream());
-}
-
-/*
- * This will only work if this matrix is in column-major order! In other words,
- * if isTrans() returns true.
- */
-void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b, float scaleThis, float scaleAB, cudaStream_t stream) {
- assert(a.getNumCols() == b.getNumRows());
-
- if (scaleThis == 0) {
- resize(a.getNumRows(), b.getNumCols());
- setTrans(true);
- }
-
- assert(this->getNumRows() == a.getNumRows());
- assert(this->getNumCols() == b.getNumCols());
- assert(_isTrans);
- CUBLAS_CALL(cublasSetStream_v2(getCublasHandle(), stream));
- CUBLAS_CALL(cublasSgemm_v2(getCublasHandle(), a.getTransChar(), b.getTransChar(), a.getNumRows(), b.getNumCols(), a.getNumCols(),
- &scaleAB, a.getDevData(), a.getStride(), b.getDevData(), b.getStride(),
- &scaleThis, getDevData(), getStride()));
-}
-
-void NVMatrix::addProduct(NVMatrix& a, NVMatrix &b) {
- addProduct(a, b, 1, 1);
-}
-
-void NVMatrix::assertSame(NVMatrixV& a) {
- for (int i = 1; i < a.size(); ++i) {
- assert(a[i]->isSameDims(*a[0]));
- assert(a[i]->isTrans() == a[0]->isTrans());
- assert(a[i]->getStride() == a[0]->getStride());
- assert(a[i]->getDataDeviceID() == a[0]->getDataDeviceID());
- }
-}
-
-void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB,
- const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev) {
- batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, getDefaultStream(), aPtrsDev, bPtrsDev, tgtPtrsDev);
-}
-
-void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB) {
- batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, getDefaultStream());
-}
-
-void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream,
- const float** aPtrsDev, const float** bPtrsDev, float** tgtPtrsDev) {
- assert(a.size() == b.size());
- assert(a.size() == target.size());
- assertSame(a);
- assertSame(b);
- assertSame(target);
-
- const int batch = a.size();
- if (batch > 0) {
- const int rows = a[0]->getNumRows(), inner = a[0]->getNumCols(), cols = b[0]->getNumCols();
-
- assert(inner == b[0]->getNumRows());
- assert(target[0]->getNumRows() == rows);
- assert(target[0]->getNumCols() == cols);
-
- const int lda = a[0]->getStride(), ldb = b[0]->getStride(), ldc = target[0]->getStride();
- cublasOperation_t atrans = a[0]->getTransChar(), btrans = b[0]->getTransChar();
-
- CUBLAS_CALL(cublasSetStream_v2(getCublasHandle(), stream));
- CUBLAS_CALL(cublasSgemmBatched(getCublasHandle(), atrans, btrans, rows, cols, inner, &scaleAB, aPtrsDev, lda, bPtrsDev, ldb, &scaleTarget, tgtPtrsDev, ldc, batch));
- }
-}
-
-void NVMatrix::batchedMatrixMultiply(NVMatrixV& a, NVMatrixV& b, NVMatrixV& target, float scaleTarget, float scaleAB, cudaStream_t stream) {
- assert(a.size() == b.size());
- assert(a.size() == target.size() || target.size() == 0);
-
- const int batch = a.size();
- if (batch > 0) {
- const int rows = a[0]->getNumRows(), cols = b[0]->getNumCols();
-
- const float* aPtrs[batch], *bPtrs[batch], *tgtPtrs[batch];
- for (int i = 0; i < batch; ++i) {
- if (target.size() <= i) {
- target.push_back(new NVMatrix(rows, cols, true));
- }
- aPtrs[i] = a[i]->getDevData();
- bPtrs[i] = b[i]->getDevData();
- tgtPtrs[i] = target[i]->getDevData();
- }
-
-// const float** aPtrsDev, **bPtrsDev;
-// float **tgtPtrsDev;
-// checkCudaErrors(cudaMalloc(&aPtrsDev, batch * sizeof(float*)));
-// checkCudaErrors(cudaMalloc(&bPtrsDev, batch * sizeof(float*)));
-// checkCudaErrors(cudaMalloc(&tgtPtrsDev, batch * sizeof(float*)));
- MemorySegment* aPtrsDev = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * sizeof(float*));
- MemorySegment* bPtrsDev = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * sizeof(float*));
- MemorySegment* tgtPtrsDev = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(batch * sizeof(float*));
-
- checkCudaErrors(cudaMemcpyAsync(aPtrsDev, aPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream));
- checkCudaErrors(cudaMemcpyAsync(bPtrsDev, bPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream));
- checkCudaErrors(cudaMemcpyAsync(tgtPtrsDev, tgtPtrs, batch * sizeof(float*), cudaMemcpyHostToDevice, stream));
-
- batchedMatrixMultiply(a, b, target, scaleTarget, scaleAB, stream, const_cast<const float**>(aPtrsDev->getData<float*>()),
- const_cast<const float**>(bPtrsDev->getData<float*>()),
- tgtPtrsDev->getData<float*>());
-
-// checkCudaErrors(cudaFree(aPtrsDev));
-// checkCudaErrors(cudaFree(bPtrsDev));
-// checkCudaErrors(cudaFree(tgtPtrsDev));
- DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(aPtrsDev);
- DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(bPtrsDev);
- DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).free(tgtPtrsDev);
- }
-}
-
-template <class Randomizer>
-void NVMatrix::_unaryRandomize(NVMatrix& target, Randomizer rnd) {
- _unaryRandomize(target, rnd, getDefaultStream());
-}
-
-template <class Randomizer>
-void NVMatrix::_unaryRandomize(NVMatrix& target, Randomizer rnd, cudaStream_t stream) {
- assert(isRndInitialized());
- assert(isContiguous() && target.isContiguous());
- if (!isSameDims(target)) {
- target.resize(*this);
- }
- assert(isTrans() == target.isTrans());
- kUnaryRandomize<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK, 0, stream>>>(getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd);
- getLastCudaError("kUnaryRandomize: Kernel execution failed");
-}
-
-template <class Randomizer>
-void NVMatrix::_binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd) {
- _binaryRandomize(data2, target, rnd, getDefaultStream());
-}
-
-template <class Randomizer>
-void NVMatrix::_binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd, cudaStream_t stream) {
- assert(isRndInitialized());
- assert(isContiguous() && data2.isContiguous() && target.isContiguous());
- assert(isSameDims(data2));
- assert(isTrans() == data2.isTrans());
- if (!isSameDims(target)) {
- target.resize(*this);
- }
- assert(isTrans() == target.isTrans());
- kBinaryRandomize<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK, 0, stream>>>(getDevData(), data2.getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd);
- getLastCudaError("kBinaryRandomize: Kernel execution failed");
-}
-
-void NVMatrix::initRandom(unsigned long long seed, int numStreams) {
- NVMatrix::initRandom(seed, numStreams, NVMatrix::getDefaultStream());
-}
-
-void NVMatrix::initRandom(unsigned long long seed, int numStreams, cudaStream_t stream) {
-// printf("init random on device %d\n", getDeviceID());
- pthread_mutex_lock(_rndMutex);
- assert(!isRndInitialized(true));
- int d = getDeviceID();
-// _rndDevStates[d] = NULL;
- _rndDevThreads[d] = numStreams;
- _rndDevStates[d] = DEVICE_MEMORY_MANAGER::getInstance(d).malloc(numStreams * sizeof(curandState));
-// checkCudaErrors(cudaMalloc((void **)&_rndDevStates[d], numStreams * sizeof(curandState)));
- pthread_mutex_unlock(_rndMutex);
- kSetupCurand<<<NUM_RND_BLOCKS, NUM_RND_THREADS_PER_BLOCK, 0, stream>>>(getCurandState(), 1 + seed*2); // so there's no chance it'll be correlated with the other one
- getLastCudaError("kSetupCurand: Kernel execution failed");
-}
-
-void NVMatrix::initRandom(unsigned long long seed) {
- initRandom(seed, NUM_RND_STREAMS);
-}
-
-void NVMatrix::initRandom() {
- NVMatrix::initRandom(time(0));
-}
-
-void NVMatrix::initCublas() {
- int d = getDeviceID();
- pthread_mutex_lock(_cublasMutex);
- assert(_cublasHandles.count(d) == 0);
- CUBLAS_CALL(cublasCreate(&_cublasHandles[d]));
- // It appears that cublasCreate causes a host -> device copy on stream 0,
- // so we synchronize with it because we run everything else on other
- // streams.
- syncDevice();
- pthread_mutex_unlock(_cublasMutex);
-}
-
-void NVMatrix::destroyCublas() {
- int d = getDeviceID();
- pthread_mutex_lock(_cublasMutex);
- assert(_cublasHandles.count(d) > 0);
- CUBLAS_CALL(cublasDestroy(_cublasHandles[d]));
- _cublasHandles.erase(d);
- pthread_mutex_unlock(_cublasMutex);
-}
-
-cublasHandle_t NVMatrix::getCublasHandle() {
- return getCublasHandle(getDeviceID());
-}
-
-cublasHandle_t NVMatrix::getCublasHandle(int deviceID) {
- pthread_mutex_lock(_cublasMutex);
- assert(_cublasHandles.count(deviceID) > 0);
- cublasHandle_t h = _cublasHandles[deviceID];
- pthread_mutex_unlock(_cublasMutex);
- return h;
-}
-
-cudaStream_t NVMatrix::getDefaultStream() {
- return getDefaultStream(NVMatrix::getDeviceID());
-}
-
-cudaStream_t NVMatrix::getDefaultStream(int deviceID) {
- if (deviceID >= 0) {
- pthread_mutex_lock(_streamMutex);
- if (_defaultStreams.count(deviceID) == 0) {
- int oldDeviceID = getDeviceID();
- NVMatrix::setDeviceID(deviceID);
- checkCudaErrors(cudaStreamCreateWithFlags(&_defaultStreams[deviceID], cudaStreamNonBlocking));
- NVMatrix::setDeviceID(oldDeviceID);
- }
- cudaStream_t s = _defaultStreams[deviceID];
- pthread_mutex_unlock(_streamMutex);
- return s;
- }
- return 0;
-}
-
-void NVMatrix::syncDevice() {
- checkCudaErrors(cudaDeviceSynchronize());
-}
-
-void NVMatrix::syncStream(cudaStream_t stream) {
- checkCudaErrors(cudaStreamSynchronize(stream));
-}
-
-void NVMatrix::syncStream() {
- syncStream(getDefaultStream());
-}
-
-curandState* NVMatrix::getCurandState() {
- /*
- * Even though we're only reading from the map here, it's important to grab
- * the mutex because another thread may be writing to it.
- */
- pthread_mutex_lock(_rndMutex);
- int d = getDeviceID();
- assert(isRndInitialized(true));
- curandState* r = _rndDevStates[d]->getData<curandState>();
- pthread_mutex_unlock(_rndMutex);
- return r;
-}
-
-curandState* NVMatrix::getCurandState(int numStreams) {
- int d = getDeviceID();
- pthread_mutex_lock(_rndMutex);
- assert(isRndInitialized(true));
- bool realloc = numStreams > _rndDevThreads[d];
- pthread_mutex_unlock(_rndMutex);
-
- if (realloc) {
- destroyRandom();
- initRandom(time(0), numStreams);
- }
- return getCurandState();
-}
-
-int NVMatrix::getDataDeviceID() const {
- if (getDevData() == NULL) {
- return DEVICE_NULL;
- }
- struct cudaPointerAttributes atts;
- checkCudaErrors(cudaPointerGetAttributes(&atts, getDevData()));
- return atts.memoryType == cudaMemoryTypeDevice ? atts.device : DEVICE_HOST;
-}
-
-
-int NVMatrix::getDeviceID() {
- int d;
- checkCudaErrors(cudaGetDevice(&d));
-// if (d == 0) {
-// raise(SIGABRT);
-// }
- return d;
-}
-
-void NVMatrix::setDeviceID(int d) {
- assert(d >= 0);
-// printf("Setting device to %d\n", d);
-// if (d == 0) {
-// raise(SIGABRT);
-// }
- checkCudaErrors(cudaSetDevice(d));
-}
-
-bool NVMatrix::canAccessPeer(int srcDevice, int tgtDevice) {
- if (srcDevice == tgtDevice) {
- return true;
- }
- int canAccess;
- checkCudaErrors(cudaDeviceCanAccessPeer(&canAccess, srcDevice, tgtDevice));
- return canAccess;
-}
-
-bool NVMatrix::isRndInitialized(bool haveLock) {
- if (!haveLock) {
- pthread_mutex_lock(_rndMutex);
- }
- bool b = _rndDevStates.count(getDeviceID()) != 0;
- if (!haveLock) {
- pthread_mutex_unlock(_rndMutex);
- }
- return b;
-}
-
-bool NVMatrix::isRndInitialized() {
- return isRndInitialized(false);
-}
-
-void NVMatrix::destroyRandom() {
- int d = getDeviceID();
- pthread_mutex_lock(_rndMutex);
- assert(isRndInitialized(true));
-// checkCudaErrors(cudaFree(_rndDevStates[d]));
- DEVICE_MEMORY_MANAGER::getInstance(d).free(_rndDevStates[d]);
- _rndDevStates.erase(d);
- _rndDevThreads.erase(d);
- pthread_mutex_unlock(_rndMutex);
-}
-
-void NVMatrix::binarizeProbs() {
- binarizeProbs(*this);
-}
-
-void NVMatrix::binarizeProbs(NVMatrix& target) {
- _unaryRandomize(target, BinarizeUnaryRandomizer());
-}
-
-void NVMatrix::randomizeUniform() {
- assert(isContiguous());
- assert(isRndInitialized());
-// CURAND_CALL(curandGenerateUniform(rndGen, _devData, getNumElements()));
- _unaryRandomize(*this, UniformUnaryRandomizer());
-}
-
-void NVMatrix::randomizeGaussian() {
- randomizeGaussian(1);
-}
-
-void NVMatrix::randomizeGaussian(float stdev) {
- randomizeGaussian(0, stdev);
-}
-
-void NVMatrix::randomizeGaussian(float mean, float stdev) {
- assert(isContiguous());
- assert(isRndInitialized());
-// CURAND_CALL(curandGenerateNormal(rndGen, _devData, getNumElements(), mean, stdev));
- _unaryRandomize(*this, GaussianUnaryRandomizer(mean, stdev));
-}
-
-/*
- * Kind of a hack since we don't actually need the contents of this matrix for it,
- * so we don't really need a binary randomizer.
- */
-void NVMatrix::randomizeGaussian(NVMatrix& stdevs) {
- randomizeGaussian(0, stdevs);
-}
-
-void NVMatrix::randomizeGaussian(float mean, NVMatrix& stdevs) {
- _binaryRandomize(stdevs, *this, GaussianBinaryRandomizer(mean));
-}
-
-void NVMatrix::randomizeGaussian(float mean, float stdevMult, NVMatrix& stdevs) {
- _binaryRandomize(stdevs, *this, ScaledGaussianBinaryRandomizer(mean, stdevMult));
-}
-
-void NVMatrix::addGaussianNoise() {
- addGaussianNoise(1);
-}
-
-void NVMatrix::addGaussianNoise(float stdev) {
- addGaussianNoise(stdev, *this);
-}
-
-void NVMatrix::addGaussianNoise(float stdev, NVMatrix& target) {
- _unaryRandomize(target, AddGaussianUnaryRandomizer(stdev));
-}
-
-void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var) {
- addGaussianNoise(stdevs, var, *this);
-}
-
-void NVMatrix::addGaussianNoise(NVMatrix& stdevs) {
- addGaussianNoise(stdevs, false, *this);
-}
-
-void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var, NVMatrix& target) {
- if (var) {
- _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer<true>());
- } else {
- _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer<false>());
- }
-}
-
-void NVMatrix::biggerThan(NVMatrix& b, NVMatrix& target) {
- applyBinary(NVMatrixBinaryOps::BiggerThan(), b, target);
-}
-
-void NVMatrix::biggerThan(NVMatrix& b) {
- biggerThan(b, *this);
-}
-
-void NVMatrix::equals(NVMatrix& b, NVMatrix& target) {
- applyBinary(NVMatrixBinaryOps::Equals(), b, target);
-}
-
-void NVMatrix::equals(NVMatrix& m) {
- equals(m, *this);
-}
-
-void NVMatrix::biggerThanVector(NVMatrix& vec, NVMatrix& target) {
- applyBinaryV(NVMatrixBinaryOps::BiggerThan(), vec, target);
-}
-
-void NVMatrix::biggerThanVector(NVMatrix& vec) {
- biggerThanVector(vec, *this);
-}
-
-void NVMatrix::_checkBounds(int startRow, int endRow, int startCol, int endCol) const {
- assert(startRow >= 0 && startRow <= _numRows);
- assert(endRow >= startRow && endRow <= _numRows);
-
- assert(startCol >= 0 && startCol <= _numCols);
- assert(endCol >= startCol && endCol <= _numCols);
-}
-
-/*
- * The only place where stride is supported for now!
- * Will ALWAYS return a view of the original data, sometimes non-contiguous.
- */
-NVMatrix& NVMatrix::slice(int startRow, int endRow, int startCol, int endCol) const {
- endRow = endRow < 0 ? this->_numRows : endRow;
- endCol = endCol < 0 ? this->_numCols : endCol;
- _checkBounds(startRow, endRow, startCol, endCol);
-
- if (!isTrans()) {
- return construct(new MemorySegment(this->getDevData() + startRow * _stride + startCol), endRow - startRow, endCol - startCol, _stride, false);
- }
- return construct(new MemorySegment(this->getDevData() + startCol * _stride + startRow), endRow - startRow, endCol - startCol, _stride, true);
-}
-
-/* this will NEVER return a view */
-void NVMatrix::slice(int startRow, int endRow, int startCol, int endCol, NVMatrix& target) const {
- endRow = endRow < 0 ? this->_numRows : endRow;
- endCol = endCol < 0 ? this->_numCols : endCol;
- _checkBounds(startRow, endRow, startCol, endCol);
-
- int sliceRows = endRow - startRow, sliceCols = endCol - startCol;
- if (target.getNumRows() != sliceRows || target.getNumCols() != sliceCols) {
- target.resize(sliceRows, sliceCols);
- }
- this->copy(target, startRow, endRow, startCol, endCol, 0, 0);
-}
-
-NVMatrix& NVMatrix::sliceRows(int startRow, int endRow) const {
- return slice(startRow, endRow, 0, -1);
-}
-
-void NVMatrix::sliceRows(int startRow, int endRow, NVMatrix& target) const {
- slice(startRow, endRow, 0, -1, target);
-}
-
-NVMatrix& NVMatrix::sliceCols(int startCol, int endCol) const {
- return slice(0, -1, startCol, endCol);
-}
-
-void NVMatrix::sliceCols(int startCol, int endCol, NVMatrix& target) const {
- slice(0, -1, startCol, endCol, target);
-}
-
-NVMatrixV& NVMatrix::splitRows(int numParts) {
- assert(getNumRows() % numParts == 0);
- NVMatrixV& v = *new NVMatrixV();
- int partSize = getNumRows() / numParts;
- for (int p = 0; p < numParts; ++p) {
- v.push_back(&sliceRows(p * partSize, (p+1) * partSize));
- }
- return v;
-}
-
-NVMatrixV& NVMatrix::splitCols(int numParts) {
- assert(getNumCols() % numParts == 0);
- NVMatrixV& v = *new NVMatrixV();
- int partSize = getNumCols() / numParts;
- for (int p = 0; p < numParts; ++p) {
- v.push_back(&sliceCols(p * partSize, (p+1) * partSize));
- }
- return v;
-}
-
-/*
- * Guaranteed to not change the data if the number of elements doesn't change.
- * So you can use this to "reshape" a matrix.
- */
-bool NVMatrix::resize(int numRows, int numCols, bool trans) {
- setTrans(trans);
- bool reallocated = false;
- if (numRows != _numRows || numCols != _numCols) {
- assert(_ownsData || (_numElements == numRows * numCols && isContiguous()));
- if (_numElements != numRows * numCols) {
- if (_numElements > 0) { // free old memory
- dealloc();
- }
- if (numRows * numCols > 0) { // allocate new memory
- alloc(numCols * numRows);
- } else {
- _memSegment = NULL;
- }
- reallocated = true;
- }
- _numRows = numRows;
- _numCols = numCols;
- _numElements = numRows * numCols;
- _stride = getLeadingDim();
- }
- return reallocated;
-}
-
-bool NVMatrix::resize(int numRows, int numCols) {
- return resize(numRows, numCols, isTrans());
-}
-
-bool NVMatrix::resize(const NVMatrix& like) {
- setTrans(like.isTrans());
- return resize(like.getNumRows(), like.getNumCols());
-}
-
-bool NVMatrix::resize(const Matrix& like) {
- setTrans(like.isTrans());
- return resize(like.getNumRows(), like.getNumCols());
-}
-
-void NVMatrix::reshape(int numRows, int numCols) {
- assert(isContiguous());
- assert(_numElements == numRows*numCols);
- _numRows = numRows;
- _numCols = numCols;
- _stride = getLeadingDim();
-}
-
-NVMatrix& NVMatrix::reshaped(int numRows, int numCols) const {
- assert(isContiguous());
- assert(_numElements == numRows*numCols);
- return construct(new MemorySegment(*_memSegment), numRows, numCols, -1, _isTrans);
-}
-
-void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow,
- int srcStartCol, int srcEndCol,
- int destStartRow, int destStartCol) const {
- copy(dest, srcStartRow, srcEndRow, srcStartCol, srcEndCol, destStartRow, destStartCol, getDefaultStream());
-}
-
-void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow,
- int srcStartCol, int srcEndCol,
- int destStartRow, int destStartCol, cudaStream_t stream) const {
- srcEndRow = srcEndRow < 0 ? _numRows : srcEndRow;
- srcEndCol = srcEndCol < 0 ? _numCols : srcEndCol;
- NVMatrix* srcSlice = &slice(srcStartRow, srcEndRow, srcStartCol, srcEndCol);
- NVMatrix* destSlice = &dest.slice(destStartRow, destStartRow + srcEndRow - srcStartRow, destStartCol, destStartCol + srcEndCol - srcStartCol);
- if (srcSlice->isContiguous() && destSlice->isContiguous() && srcSlice->isSameDims(*destSlice) && srcSlice->isTrans() == destSlice->isTrans()) {
- // The commonest case.
- checkCudaErrors(cudaMemcpyAsync(destSlice->getDevData(), srcSlice->getDevData(), srcSlice->getNumDataBytes(), cudaMemcpyDefault, stream));
- } else {
- srcSlice->apply(NVMatrixOps::Identity(), *destSlice, stream);
- }
- delete srcSlice;
- delete destSlice;
-}
-
-
-NVMatrix& NVMatrix::getTranspose() {
- return construct(new MemorySegment(*_memSegment), _numCols, _numRows, _stride, !_isTrans);
-}
-
-NVMatrix& NVMatrix::getClone() {
- return construct(new MemorySegment(*_memSegment), _numRows, _numCols, _stride, _isTrans);
-}
-
-void NVMatrix::transpose(NVMatrix& target) {
- flipTrans(target);
- target.setTrans(!target.isTrans());
- target.reshape(target.getNumCols(), target.getNumRows());
-}
-
-void NVMatrix::transpose() {
- int tmp = _numCols;
- _numCols = _numRows;
- _numRows = tmp;
- _isTrans = !_isTrans;
-}
-
-bool NVMatrix::transpose(bool trans) {
- bool oldTrans = _isTrans;
- if (oldTrans != trans) {
- transpose();
- }
- return oldTrans;
-}
-
-/*
- * Flips the ordering of the matrix from row-major to column-major and vice versa.
- * This creates temporary storage -- not a cheap operation.
- *
- * This is not equivalent to a "hard transpose". The resultant matrix still has
- * the same dimensions, its layout in memory just changes.
- */
-NVMatrix& NVMatrix::flipTrans() {
- NVMatrix& meTrans = construct(*this);
- flipTrans(meTrans);
- return meTrans;
-}
-
-void NVMatrix::flipTrans(NVMatrix& target) {
- flipTrans(target, getDefaultStream());
-}
-
-void NVMatrix::flipTrans(NVMatrix& target, cudaStream_t stream) {
- assert(&target != this);
- target.resize(_numRows, _numCols);
- target.setTrans(!isTrans());
-// target.printShape("target");
-// this->printShape("this");
- apply(NVMatrixOps::Identity(), target, stream);
-}
-
-void NVMatrix::squaredDiff(NVMatrix& b) {
- squaredDiff(b, *this);
-}
-
-void NVMatrix::squaredDiff(NVMatrix& b, NVMatrix& target) {
- applyBinary(NVMatrixBinaryOps::SquaredDiff(), b, target);
-}
-
-void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target) {
- add(b, scaleA, scaleB, target, NVMatrix::getDefaultStream());
-}
-
-void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target, cudaStream_t stream) {
- if (scaleA == 0) {
- b.scale(scaleB, target, stream);
- } else if (scaleB == 0) {
- scale(scaleA, target, stream);
- } else if (scaleA == 1 && scaleB == 1) { // slight optimization
- applyBinary(NVMatrixBinaryOps::Add(), b, target, stream);
- } else if (scaleA == 1) {
- applyBinary(NVMatrixBinaryOps::WeightedAdd1(scaleB), b, target, stream);
- } else {
- applyBinary(NVMatrixBinaryOps::WeightedAdd(scaleA, scaleB), b, target, stream);
- }
-}
-
-void NVMatrix::add(NVMatrix& b, float scaleB, NVMatrix& target) {
- add(b, 1, scaleB, target);
-}
-
-void NVMatrix::add(NVMatrix& b, NVMatrix& target) {
- add(b, 1, target);
-}
-
-void NVMatrix::add(NVMatrix& b, float scaleB) {
- add(b, scaleB, *this);
-}
-
-void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB) {
- add(b, scaleA, scaleB, *this);
-}
-
-void NVMatrix::add(NVMatrix& b) {
- add(b, 1, *this);
-}
-
-void NVMatrix::subtract(NVMatrix& b, NVMatrix& target) {
- add(b, -1, target);
-}
-
-void NVMatrix::subtract(NVMatrix& b) {
- add(b, -1);
-}
-
-void NVMatrix::eltwiseMult(NVMatrix& b, NVMatrix& target) {
- applyBinary(NVMatrixBinaryOps::Multiply(), b, target);
-}
-
-void NVMatrix::eltwiseMult(NVMatrix& b) {
- eltwiseMult(b, *this);
-}
-
-void NVMatrix::eltwiseDivide(NVMatrix& b, NVMatrix& target) {
- applyBinary(NVMatrixBinaryOps::Divide(), b, target);
-}
-
-void NVMatrix::eltwiseDivide(NVMatrix& b) {
- eltwiseDivide(b, *this);
-}
-
-void NVMatrix::tile(int timesY, int timesX, NVMatrix& target) {
- tile(timesY, timesX, target, getDefaultStream());
-}
-
-void NVMatrix::tile(int timesY, int timesX, NVMatrix& target, cudaStream_t stream) {
- assert(isContiguous() && target.isContiguous());
- assert(timesX > 0 && timesY > 0);
- target.resize(_numRows*timesY, _numCols*timesX);
- target.setTrans(_isTrans);
- if(!isTrans()) {
- kTile<<<NUM_TILE_BLOCKS,NUM_TILE_THREADS_PER_BLOCK, 0, stream>>>(getDevData(), target.getDevData(), _numCols, _numRows, target._numCols, target._numRows);
- } else {
- kTile<<<NUM_TILE_BLOCKS,NUM_TILE_THREADS_PER_BLOCK, 0, stream>>>(getDevData(), target.getDevData(), _numRows, _numCols, target._numRows, target._numCols);
- }
- getLastCudaError("Kernel execution failed");
-}
-
-void NVMatrix::addVector(NVMatrix& vec, float scaleVec, NVMatrix& target) {
- addVector(vec, scaleVec, target, getDefaultStream());
-}
-
-void NVMatrix::addVector(NVMatrix& vec, float scaleVec, NVMatrix& target, cudaStream_t stream) {
- applyBinaryV(NVMatrixBinaryOps::ScaledAdd(scaleVec), vec, target, stream);
-}
-
-void NVMatrix::addVector(NVMatrix& vec) {
- addVector(vec, 1);
-}
-
-void NVMatrix::addVector(NVMatrix& vec, float scaleVec) {
- addVector(vec, scaleVec, *this);
-}
-
-void NVMatrix::addVector(NVMatrix& vec, NVMatrix& target) {
- addVector(vec, 1, target);
-}
-
-void NVMatrix::equalsVector(NVMatrix& vec, NVMatrix& target) {
- applyBinaryV(NVMatrixBinaryOps::Equals(), vec, target);
-}
-
-void NVMatrix::equalsVector(NVMatrix& vec) {
- equalsVector(vec, *this);
-}
-
-void NVMatrix::eltwiseMultByVector(NVMatrix& vec, NVMatrix& target) {
- eltwiseMultByVector(vec, target, getDefaultStream());
-}
-
-void NVMatrix::eltwiseMultByVector(NVMatrix& vec, NVMatrix& target, cudaStream_t stream) {
- applyBinaryV(NVMatrixBinaryOps::Multiply(), vec, target, stream);
-}
-
-void NVMatrix::eltwiseMultByVector(NVMatrix& vec, cudaStream_t stream) {
- eltwiseMultByVector(vec, *this, stream);
-}
-
-void NVMatrix::eltwiseMultByVector(NVMatrix& vec) {
- eltwiseMultByVector(vec, *this);
-}
-
-void NVMatrix::eltwiseDivideByVector(NVMatrix& vec) {
- eltwiseDivideByVector(vec, *this);
-}
-
-void NVMatrix::eltwiseDivideByVector(NVMatrix& vec, NVMatrix& target) {
- applyBinaryV(NVMatrixBinaryOps::Divide(), vec, target);
-}
-
-template<class Agg, class UnaryOp, class BinaryOp>
-void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream) {
- _aggregate(axis, target, agg, uop, bop, stream, NULL);
-}
-
-/*
- * TODO: this is a mess, fix it. it works pretty fast but it's too ugly.
- * TODO: this function is _really_ bad for very long aggregations of few columns.
- */
-template<class Agg, class UnaryOp, class BinaryOp>
-void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix* tmp) {
- assert(axis == 0 || axis == 1);
- assert(isContiguous() && target.isContiguous());
- assert(&target != this);
- int width = _isTrans ? _numRows : _numCols;
- int height = _isTrans ? _numCols : _numRows;
-
- target.setTrans(_isTrans);
- assert(width > 0);
- assert(height > 0);
- if((axis == 0 && !_isTrans) || (axis == 1 && _isTrans)) { //col sum
- target.resize(!_isTrans ? 1 : _numRows, !_isTrans ? _numCols : 1);
-// int height = getFollowingDim();
- if ((height <= 2048 || width >= 4096)) {
- int numBlocks = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK);
- assert(numBlocks * NUM_SUM_COLS_THREADS_PER_BLOCK >= width);
- assert(numBlocks < NUM_BLOCKS_MAX);
- kDumbAggCols<Agg, UnaryOp, BinaryOp><<<numBlocks,NUM_SUM_COLS_THREADS_PER_BLOCK, 0, stream>>>(getTextureObject(), target.getDevData(), width, height, agg, uop, bop);
- getLastCudaError("kDumbAggCols: Kernel execution failed");
- } else { // Specialize the case when we have very long columns and few of them
- const int sumLength = 128;
- bool deltmp = tmp == NULL;
- if (tmp == NULL) {
- tmp = new NVMatrix(false);
- }
-
- int numBlocksX = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK);
- int numBlocksY = DIVUP(height, sumLength);
- tmp->resize(numBlocksY, width);
-
- dim3 blocks(numBlocksX, numBlocksY);
- dim3 threads(NUM_SUM_COLS_THREADS_PER_BLOCK);
- kAggCols<Agg, UnaryOp><<<blocks,threads, 0, stream>>>(getTextureObject(), tmp->getDevData(), width, height, sumLength, agg, uop);
- getLastCudaError("kAggCols: Kernel execution failed");
-
- int numBlocks = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK);
- kDumbAggCols<Agg, NVMatrixOps::Identity, BinaryOp><<<numBlocks,NUM_SUM_COLS_THREADS_PER_BLOCK, 0, stream>>>(tmp->getTextureObject(), target.getDevData(), width, numBlocksY, agg, NVMatrixOps::Identity(), bop);
- getLastCudaError("kDumbAggCols: Kernel execution failed");
- if (deltmp) {
- delete tmp;
- }
- }
- } else { // row sum
- target.resize(_isTrans ? 1 : _numRows, _isTrans ? _numCols : 1);
- if (width > 1) {
- if (height >= 16384) { // linear aggregation
- int numBlocksX = 1;
- int numBlocksY = DIVUP(height, AGG_SHORT_ROWS_THREADS_Y*AGG_SHORT_ROWS_LOOPS_Y);
- int numThreadsX = width <= 4 ? 4 : width <= 8 ? 8 : width <= 12 ? 12 : width <= 16 ? 16 : AGG_SHORT_ROWS_THREADS_X;
- int numThreadsY = AGG_SHORT_ROWS_THREADS_Y;
- while (numBlocksY > NUM_BLOCKS_MAX) {
- numBlocksY = DIVUP(numBlocksY,2);
- numBlocksX *= 2;
- }
- dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY);
- if(width <= 16) {
- if(width <= 4) {
- kAggShortRows<Agg, UnaryOp, BinaryOp, 1, 4><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop);
- } else if(width <= 8) {
- kAggShortRows<Agg, UnaryOp, BinaryOp, 1, 8><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop);
- } else if(width <= 12) {
- kAggShortRows<Agg, UnaryOp, BinaryOp, 1, 12><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop);
- } else {
- kAggShortRows<Agg, UnaryOp, BinaryOp, 1, 16><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop);
- }
- } else if(width <= 32) {
- kAggShortRows<Agg, UnaryOp, BinaryOp, 2, AGG_SHORT_ROWS_THREADS_X><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop);
- } else if(width <= 48){
- kAggShortRows<Agg, UnaryOp, BinaryOp, 3, AGG_SHORT_ROWS_THREADS_X><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop);
- } else if(width <= 64){
- kAggShortRows<Agg, UnaryOp, BinaryOp, 4, AGG_SHORT_ROWS_THREADS_X><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop);
- } else {
- kAggShortRows2<Agg, UnaryOp, BinaryOp><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),width, height, agg, uop, bop);
- }
- } else {
- if (width >= 512) {
- // NOTE: this is the only case which I bothered to try to optimize for Kepler
- dim3 threads(AWR_NUM_THREADS);
- dim3 blocks(1, height);
- kAggRows_wholerow_nosync<<<blocks, threads, 0, stream>>>(getDevData(), target.getDevData(), width, height, agg, uop, bop);
- } else {
-
- int numThreadsX = width <= 64 ? 32 : (width <= 128 ? 64 : (width <= 256 ? 128 : (width <= 512 ? 256 : 512)));
- int numThreadsY = 1;
- int numBlocksX = DIVUP(width, 2*numThreadsX);
- int numBlocksY = std::min(height, NUM_BLOCKS_MAX);
-
- dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY);
- assert(numBlocksX <= NUM_BLOCKS_MAX);
- assert(numBlocksY <= NUM_BLOCKS_MAX);
-
- if(width <= 64) {
- kAggRows<Agg, UnaryOp, BinaryOp, 32><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),
- width, height, target.getLeadingDim(), agg, uop, bop);
- } else if(width <= 128) {
- kAggRows<Agg, UnaryOp, BinaryOp, 64><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),
- width, height, target.getLeadingDim(), agg, uop, bop);
- } else if(width <= 256) {
- kAggRows<Agg, UnaryOp, BinaryOp, 128><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),
- width, height, target.getLeadingDim(), agg, uop, bop);
- } else if(width <= 512) {
- kAggRows<Agg, UnaryOp, BinaryOp, 256><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),
- width, height, target.getLeadingDim(), agg, uop, bop);
- } else {
- kAggRows<Agg, UnaryOp, BinaryOp, 512><<<grid, threads, 0, stream>>>(getDevData(), target.getDevData(),
- width, height, target.getLeadingDim(), agg, uop, bop);
- }
-
- getLastCudaError("agg rows: Kernel execution failed");
- }
- }
- } else {
- target.applyBinary(NVMatrixBinaryOps::CompositeSecond<UnaryOp, BinaryOp>(uop, bop), *this, target, stream);
-// copy(target, stream);
- }
- }
-}
-
-template<class Agg, class UnaryOp, class BinaryOp>
-void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop) {
- _aggregate(axis, target, agg, uop, bop, getDefaultStream());
-}
-
-template<class Agg, class BinaryOp>
-void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop) {
- _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, getDefaultStream());
-}
-
-template<class Agg, class BinaryOp>
-void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream) {
- _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, stream);
-}
-
-template<class Agg, class UnaryOp, class BinaryOp>
-NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop) {
- NVMatrix &sumVec = construct();
- _aggregate(axis, sumVec, agg, uop, bop);
- return sumVec;
-}
-
-template<class Agg, class UnaryOp, class BinaryOp>
-NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream) {
- NVMatrix &sumVec = construct();
- _aggregate(axis, sumVec, agg, uop, bop, stream);
- return sumVec;
-}
-
-template<class Agg, class BinaryOp>
-NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop) {
- return _aggregate(axis, agg, NVMatrixOps::Identity(), bop);
-}
-
-template<class Agg, class BinaryOp>
-NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream) {
- return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, stream);
-}
-
-
-
-template<class Agg, class UnaryOp, class BinaryOp>
-void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp) {
- _aggregate(axis, target, agg, uop, bop, getDefaultStream(), tmp);
-}
-
-template<class Agg, class BinaryOp>
-void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, NVMatrix& tmp) {
- _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, getDefaultStream(), &tmp);
-}
-
-template<class Agg, class BinaryOp>
-void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) {
- _aggregate(axis, target, agg, NVMatrixOps::Identity(), bop, stream, &tmp);
-}
-
-template<class Agg, class UnaryOp, class BinaryOp>
-NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, NVMatrix& tmp) {
- NVMatrix &sumVec = construct();
- _aggregate(axis, sumVec, agg, uop, bop, tmp);
- return sumVec;
-}
-
-template<class Agg, class UnaryOp, class BinaryOp>
-NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, UnaryOp uop, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) {
- NVMatrix &sumVec = construct();
- _aggregate(axis, sumVec, agg, uop, bop, stream, tmp);
- return sumVec;
-}
-
-template<class Agg, class BinaryOp>
-NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, NVMatrix& tmp) {
- return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, tmp);
-}
-
-template<class Agg, class BinaryOp>
-NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp bop, cudaStream_t stream, NVMatrix& tmp) {
- return _aggregate(axis, agg, NVMatrixOps::Identity(), bop, stream, tmp);
-}
-
-void NVMatrix::inRangeInc(float lower, float upper) {
- inRangeInc(lower, upper, *this);
-}
-void NVMatrix::inRangeInc(float lower, float upper, NVMatrix& target) {
- apply(NVMatrixOps::InRange<false>(lower, upper), target);
-}
-
-void NVMatrix::inRangeExc(float lower, float upper) {
- inRangeExc(lower, upper, *this);
-}
-
-void NVMatrix::inRangeExc(float lower, float upper, NVMatrix& target) {
- apply(NVMatrixOps::InRange<true>(lower, upper), target);
-}
-
-void NVMatrix::biggerThanScalar(float scalar) {
- biggerThanScalar(scalar, *this);
-}
-
-void NVMatrix::biggerThanScalar(float scalar, NVMatrix& target) {
- apply(NVMatrixOps::BiggerThanScalar(scalar), target);
-}
-
-void NVMatrix::smallerThanScalar(float scalar) {
- smallerThanScalar(scalar, *this);
-}
-
-void NVMatrix::smallerThanScalar(float scalar, NVMatrix& target) {
- apply(NVMatrixOps::SmallerThanScalar(scalar), target);
-}
-
-void NVMatrix::addScalar(float scaleThis, float scalar, NVMatrix& target) {
- apply(NVMatrixOps::WeightedAddScalar(scaleThis, scalar), target);
-}
-
-void NVMatrix::addScalar(float scalar, NVMatrix& target) {
- apply(NVMatrixOps::AddScalar(scalar), target);
-}
-
-void NVMatrix::addScalar(float scalar) {
- addScalar(scalar, *this);
-}
-
-void NVMatrix::minWithScalar(float scalar, NVMatrix& target) {
- apply(NVMatrixOps::MinWithScalar(scalar), target);
-}
-
-void NVMatrix::minWithScalar(float scalar) {
- minWithScalar(scalar, *this);
-}
-
-void NVMatrix::maxWithScalar(float scalar, NVMatrix& target) {
- apply(NVMatrixOps::MaxWithScalar(scalar), target);
-}
-
-void NVMatrix::maxWithScalar(float scalar) {
- maxWithScalar(scalar, *this);
-}
-
-void NVMatrix::pow(float p, NVMatrix& target) {
- apply(NVMatrixOps::Pow(p), target);
-}
-
-void NVMatrix::pow(float p) {
- pow(p, *this);
-}
-
-void NVMatrix::scale(float _scale) {
- scale(_scale, *this);
-}
-
-void NVMatrix::scale(float _scale, cudaStream_t stream) {
- scale(_scale, *this, stream);
-}
-
-void NVMatrix::scale(float _scale, NVMatrix& target) {
- scale(_scale, target, NVMatrix::getDefaultStream());
-}
-
-void NVMatrix::scale(float _scale, NVMatrix& target, cudaStream_t stream) {
- if (_scale != 1 || &target != this) { // optimize away scale by 1
- if (_scale == 1) {
- copy(target, stream);
- } else {
- apply(NVMatrixOps::MultByScalar(_scale), target, stream);
- }
- }
-}
-
-void NVMatrix::zero() {
- apply(NVMatrixOps::Zero());
-}
-
-void NVMatrix::zero(NVMatrix& like) {
- resize(like);
- zero();
-}
-
-void NVMatrix::max(int axis, NVMatrix& target) {
- _aggregate(axis, target, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second());
-}
-
-void NVMatrix::max(int axis, NVMatrix& target, NVMatrix& tmp) {
- _aggregate(axis, target, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second(), tmp);
-}
-
-void NVMatrix::addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum) {
- addSum(a, axis, scaleThis, scaleSum, getDefaultStream());
-}
-
-void NVMatrix::addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum, cudaStream_t stream) {
- if (scaleThis != 0) {
- a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::WeightedAdd(scaleThis, scaleSum), stream);
- } else {
- a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::SecondScaled(scaleSum), stream);
- }
-}
-
-void NVMatrix::addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax) {
- addMax(a, axis, scaleThis, scaleMax, getDefaultStream());
-}
-
-void NVMatrix::addMax(NVMatrix& a, int axis, float scaleThis, float scaleMax, cudaStream_t stream) {
- if (scaleThis != 0) {
- a._aggregate(axis, *this, NVMatrixAggs::Max(), NVMatrixBinaryOps::WeightedAdd(scaleThis, scaleMax), stream);
- } else {
- a._aggregate(axis, *this, NVMatrixAggs::Max(), NVMatrixBinaryOps::SecondScaled(scaleMax), stream);
- }
-}
-
-void NVMatrix::sum(int axis, NVMatrix& target) {
- sum(axis, target, getDefaultStream());
-}
-
-void NVMatrix::sum(int axis, NVMatrix& target, cudaStream_t stream) {
- _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second(), stream);
-}
-
-void NVMatrix::sum(int axis, NVMatrix& target, NVMatrix& tmp) {
- sum(axis, target, getDefaultStream(), tmp);
-}
-
-void NVMatrix::sum(int axis, NVMatrix& target, cudaStream_t stream, NVMatrix& tmp) {
- _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second(), stream, tmp);
-}
-
-void NVMatrix::sumOfSquares(int axis, NVMatrix& target) {
- sumOfSquares(axis, target, getDefaultStream());
-}
-
-void NVMatrix::sumOfSquares(int axis, NVMatrix& target, cudaStream_t stream) {
- _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixOps::Square(), NVMatrixBinaryOps::Second(), stream);
-}
-
-void NVMatrix::min(int axis, NVMatrix& target) {
- _aggregate(axis, target, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second());
-}
-
-NVMatrix& NVMatrix::max(int axis) {
- return _aggregate(axis, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second());
-}
-
-NVMatrix& NVMatrix::sum(int axis) {
- return _aggregate(axis, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second());
-}
-
-NVMatrix& NVMatrix::min(int axis) {
- return _aggregate(axis, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second());
-}
-
-NVMatrix& NVMatrix::sumOfSquares(int axis) {
- return _aggregate(axis, NVMatrixAggs::Sum(), NVMatrixOps::Square(), NVMatrixBinaryOps::Second());
-}
-
-void NVMatrix::_sum_setParams(int n, dim3* blocks, dim3* threads) {
- *threads = dim3(DP_BLOCKSIZE);
- *blocks = dim3(std::min(CPUSUM_MAX, DIVUP(n, DP_BLOCKSIZE)));
-}
-
-float NVMatrix::mean() {
- return sum() / getNumElements();
-}
-
-float NVMatrix::sum() {
- return _totalAgg(NVMatrixAggs::Sum());
-}
-
-float NVMatrix::sum(NVMatrix& tmpbuf) {
- return _totalAgg(NVMatrixAggs::Sum(), tmpbuf, getDefaultStream());
-}
-
-float NVMatrix::max() {
- return _totalAgg(NVMatrixAggs::Max());
-}
-
-float NVMatrix::min() {
- return _totalAgg(NVMatrixAggs::Min());
-}
-
-float NVMatrix::countNan() {
- return _totalAgg(NVMatrixAggs::CountNan());
-}
-
-float NVMatrix::countInf() {
- return _totalAgg(NVMatrixAggs::CountInf());
-}
-
-template<class Agg>
-float NVMatrix::_totalAgg(Agg agg) {
- return _totalAgg(agg, getDefaultStream());
-}
-
-template<class Agg>
-float NVMatrix::_totalAgg(Agg agg, cudaStream_t stream) {
- NVMatrix tmp;
- return _totalAgg(agg, tmp, stream);
-}
-
-template<class Agg>
-float NVMatrix::_totalAgg(Agg agg, NVMatrix& tmpbuf, cudaStream_t stream) {
- assert(isContiguous());
- dim3 blocks, threads;
- // Sum most of it on GPU
-
- _sum_setParams(getNumElements(), &blocks, &threads);
- tmpbuf.resize(1, blocks.x);
- kTotalAgg<<<blocks, threads, 0, stream>>>(getDevData(), tmpbuf.getDevData(), getNumElements(), agg);
- getLastCudaError("kTotalAgg: Kernel execution failed");
- // Don't need to sync because we copyToHost in the same stream, so it's serialized
-// NVMatrix::syncStream(stream);
- return tmpbuf.cpuAgg(agg, stream);
-}
-template<class Agg>
-float NVMatrix::cpuAgg(Agg agg, cudaStream_t stream) {
- Matrix bufCPU(getNumRows(), getNumCols());
- copyToHost(bufCPU, false, stream);
- if (getNumElements() > 1) { // Sum remainder on CPU
- if (typeid(Agg) == typeid(NVMatrixAggs::Sum)) {
- return bufCPU.sum();
- } else if (typeid(Agg) == typeid(NVMatrixAggs::Max)) {
- return bufCPU.max();
- } else if (typeid(Agg) == typeid(NVMatrixAggs::Min)) {
- return bufCPU.min();
- } else if (typeid(Agg) == typeid(NVMatrixAggs::CountNan)) {
- return bufCPU.hasNan(); //yea, it's not the same, who cares
- } else if (typeid(Agg) == typeid(NVMatrixAggs::CountInf)) {
- return bufCPU.hasInf();
- } else {
- assert(false);
- }
- }
- return bufCPU(0,0);
-}
-
-float NVMatrix::dotProduct(NVMatrix& b) {
- return dotProduct(b, getDefaultStream());
-}
-
-float NVMatrix::dotProduct(NVMatrix& b, cudaStream_t stream) {
- NVMatrix tmp;
- return dotProduct(b, tmp, stream);
-}
-
-/*
- * Fast dot product only for matrices with same transposedness.
- */
-float NVMatrix::dotProduct(NVMatrix& b, NVMatrix& tmp, cudaStream_t stream) {
- assert(isContiguous() && b.isContiguous());
- assert(isSameDims(b));
- assert(isTrans() == b.isTrans()); // see?
- dim3 blocks, threads;
- _sum_setParams(getNumElements(), &blocks, &threads);
-// NVMatrix target(1, blocks.x);
- tmp.resize(1, blocks.x);
- kDotProduct_r<<<blocks, threads, 0, stream>>>(getDevData(), b.getDevData(), tmp.getDevData(), getNumElements());
- getLastCudaError("kDotProduct_r: Kernel execution failed");
-// cudaThreadSynchronize();
-// syncStream(stream);
-// return tmp._totalAgg(NVMatrixAggs::Sum(), stream);
- return tmp.cpuAgg(NVMatrixAggs::Sum(), stream);
-}
-
-float NVMatrix::norm2() {
- return dotProduct(*this);
-}
-
-float NVMatrix::norm() {
- return sqrt(norm2());
-}
-
-void NVMatrix::print(int startRow, int rows, int startCol, int cols) const {
-// cudaThreadSynchronize();
- syncDevice();
- Matrix hm = Matrix(_numRows, _numCols);
- copyToHost(hm);
- hm.print(startRow, rows, startCol, cols);
-}
-
-void NVMatrix::print(int rows, int cols) const {
- print(0, rows, 0, cols);
-}
-
-void NVMatrix::printShape(const char* name) const {
- printf("%s: %dx%d\n", name, _numRows, _numCols);
-}
-
-void NVMatrix::alloc(int numElements) {
- _memSegment = DEVICE_MEMORY_MANAGER::getInstance(getDeviceID()).malloc(numElements * sizeof(float));
-}
-
-void NVMatrix::dealloc() {
- DEVICE_MEMORY_MANAGER::getInstance(_memSegment->getDeviceID()).free(_memSegment);
- _memSegment = NULL;
- deallocTexture();
-}
-
-void NVMatrix::deallocTexture() {
- if (_texObj != 0) {
- checkCudaErrors(cudaDestroyTextureObject(_texObj));
- _texObj = 0;
- }
-}
-
-cudaTextureObject_t NVMatrix::getTextureObject() {
- if (_texObj == 0) {
- assert(isContiguous());
- //size_t memFree, memTotal;
-
- struct cudaResourceDesc resDesc;
- memset(&resDesc, 0, sizeof(resDesc));
- resDesc.resType = cudaResourceTypeLinear;
- resDesc.res.linear.devPtr = getDevData();
- resDesc.res.linear.sizeInBytes = getNumDataBytes();
- resDesc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
- struct cudaTextureDesc texDesc;
- memset(&texDesc, 0, sizeof(texDesc));
- checkCudaErrors(cudaCreateTextureObject(&_texObj, &resDesc, &texDesc, NULL));
- }
- assert(_texObj != 0);
- return _texObj;
-}
-
-NVMatrix& NVMatrix::construct() const {
- return *new NVMatrix();
-}
-NVMatrix& NVMatrix::construct(bool isTrans) const {
- return *new NVMatrix(isTrans);
-}
-NVMatrix& NVMatrix::construct(int numRows, int numCols, bool isTrans) const {
- return *new NVMatrix(numRows, numCols, isTrans);
-}
-NVMatrix& NVMatrix::construct(const Matrix& like, bool copy) const {
- return *new NVMatrix(like, copy);
-}
-NVMatrix& NVMatrix::construct(const NVMatrix& like, bool copy) const {
- return *new NVMatrix(like, copy);
-}
-NVMatrix& NVMatrix::construct(const NVMatrix& like) const {
- return *new NVMatrix(like);
-}
-NVMatrix& NVMatrix::construct(const Matrix& like) const {
- return *new NVMatrix(like);
-}
-NVMatrix& NVMatrix::construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const {
- return *new NVMatrix(mem, numRows, numCols, stride, isTrans);
-}
-
-std::pair<size_t, size_t> NVMatrix::getCudaMemorySize() {
- size_t memFree, memTotal;
- checkCudaErrors(cudaMemGetInfo(&memFree, &memTotal));
- return std::pair<size_t,size_t>(memFree, memTotal);
-}
-
-
-/* ================
- * HostNVMatrix
- * ================
- */
-HostNVMatrix::~HostNVMatrix() {
- if (_ownsData && _numElements > 0) {
- dealloc();
- } else {
- // dealloc frees the mem segment. But if this is a view,
- // then we need to delete the mem segment object.
-// assert(_memSegment == NULL || _memSegment->getSize() == 0);
- delete _memSegment;
- }
- _deleted = true;
-}
-HostNVMatrix::HostNVMatrix() : NVMatrix() {
- _init(false);
-}
-HostNVMatrix::HostNVMatrix(bool isTrans) {
- _init(isTrans);
-}
-HostNVMatrix::HostNVMatrix(int numRows, int numCols, bool isTrans) {
- _init(isTrans);
- resize(numRows, numCols);
-}
-HostNVMatrix::HostNVMatrix(const Matrix& like, bool copy) {
- _init(like.isTrans());
- resize(like.getNumRows(), like.getNumCols());
- if (copy) {
- copyFromHost(like);
- }
-}
-HostNVMatrix::HostNVMatrix(const NVMatrix& like, bool copy) {
- _init(like.isTrans());
- resize(like.getNumRows(), like.getNumCols());
- if (copy) {
- like.copy(*this);
- }
-}
-HostNVMatrix::HostNVMatrix(const NVMatrix& like) {
- _init(like.isTrans());
- resize(like.getNumRows(), like.getNumCols());
-}
-HostNVMatrix::HostNVMatrix(const Matrix& like) {
- _init(false);
- resize(like.getNumRows(), like.getNumCols());
-}
-HostNVMatrix::HostNVMatrix(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans)
- : NVMatrix(mem, numRows, numCols, stride, isTrans) {
-}
-
-NVMatrix& HostNVMatrix::construct() const {
- return *new HostNVMatrix();
-}
-NVMatrix& HostNVMatrix::construct(bool isTrans) const {
- return *new HostNVMatrix(isTrans);
-}
-NVMatrix& HostNVMatrix::construct(int numRows, int numCols, bool isTrans) const {
- return *new HostNVMatrix(numRows, numCols, isTrans);
-}
-NVMatrix& HostNVMatrix::construct(const Matrix& like, bool copy) const {
- return *new HostNVMatrix(like, copy);
-}
-NVMatrix& HostNVMatrix::construct(const NVMatrix& like, bool copy) const {
- return *new HostNVMatrix(like, copy);
-}
-NVMatrix& HostNVMatrix::construct(const NVMatrix& like) const {
- return *new HostNVMatrix(like);
-}
-NVMatrix& HostNVMatrix::construct(const Matrix& like) const {
- return *new HostNVMatrix(like);
-}
-NVMatrix& HostNVMatrix::construct(MemorySegment* mem, int numRows, int numCols, int stride, bool isTrans) const {
- return *new HostNVMatrix(mem, numRows, numCols, stride, isTrans);
-}
-
-void HostNVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) {
- if (resizeTarget) {
- resize(hostMatrix);
- } else {
- assert(isSameDims(hostMatrix));
- }
- setTrans(hostMatrix.isTrans());
- if (getNumElements() > 0) {
- checkCudaErrors(cudaMemcpy2D(getDevData(), _stride * sizeof(float), hostMatrix.getData(),
- hostMatrix.getLeadingDim() * sizeof(float), getLeadingDim() * sizeof(float),
- getFollowingDim(), cudaMemcpyHostToHost));
-// syncStream(stream);
- }
-}
-
-void HostNVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeTarget) {
- copyFromHost(hostMatrix, resizeTarget, 0);
-}
-
-void HostNVMatrix::copyFromHost(const Matrix& hostMatrix) {
- copyFromHost(hostMatrix, false, 0);
-}
-
-void HostNVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget, cudaStream_t stream) const {
- if (resizeTarget) {
- hostMatrix.resize(getNumRows(), getNumCols());
- } else {
- assert(isSameDims(hostMatrix));
- }
- hostMatrix.setTrans(_isTrans);
- if (getNumElements() > 0) {
- checkCudaErrors(cudaMemcpy2D(hostMatrix.getData(), hostMatrix.getLeadingDim() * sizeof(float),
- getDevData(), _stride * sizeof(float), getLeadingDim() * sizeof(float),
- getFollowingDim(), cudaMemcpyHostToHost));
-// syncStream(stream);
- }
-}
-
-void HostNVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget) const {
- copyToHost(hostMatrix, resizeTarget, 0);
-}
-
-void HostNVMatrix::copyToHost(Matrix& hostMatrix) const {
- copyToHost(hostMatrix, false, 0);
-}
-
-void HostNVMatrix::alloc(int numElements) {
-// checkCudaErrors(cudaHostAlloc(&_devData, numElements * sizeof(float), cudaHostAllocPortable));
- _memSegment = HOST_MEMORY_MANAGER::getInstance().malloc(numElements * sizeof(float));
-// _memSegment = FastHostMemoryManager::getInstance().malloc(numElements * sizeof(float));
-}
-
-void HostNVMatrix::dealloc() {
-// FastHostMemoryManager::getInstance().free(_memSegment);
- HOST_MEMORY_MANAGER::getInstance().free(_memSegment);
- _memSegment = NULL;
-// checkCudaErrors(cudaFreeHost(_devData));
-}
-
-cudaTextureObject_t HostNVMatrix::getTextureObject() {
- assert(false);
- return 0;
-}
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdio.h>
-#include <cuda_runtime.h>
-#include "../include/nvmatrix_kernels.cuh"
-
-__global__ void kTile(const float* src, float* tgt, const uint srcWidth, const uint srcHeight, const uint tgtWidth, const uint tgtHeight) {
- const int idx = blockIdx.x * blockDim.x + threadIdx.x;
- const int numThreads = blockDim.x * gridDim.x;
- // const unsigned int numEls = tgtWidth * tgtHeight;
- for (uint i = idx; i < tgtWidth * tgtHeight; i += numThreads) {
- const uint y = i / tgtWidth;
- const uint x = i % tgtWidth;
- const uint srcY = y % srcHeight;
- const uint srcX = x % srcWidth;
- tgt[i] = src[srcY * srcWidth + srcX];
- }
-}
-
-__global__ void kDotProduct_r(float* a, float* b, float* target, const uint numElements) {
- __shared__ float shmem[DP_BLOCKSIZE];
-
- uint eidx = DP_BLOCKSIZE * blockIdx.x + threadIdx.x;
- shmem[threadIdx.x] = 0;
- if (eidx < gridDim.x * DP_BLOCKSIZE) {
- for (; eidx < numElements; eidx += gridDim.x * DP_BLOCKSIZE) {
- shmem[threadIdx.x] += a[eidx] * b[eidx];
- }
- }
- __syncthreads();
- if (threadIdx.x < 256) {
- shmem[threadIdx.x] += shmem[threadIdx.x + 256];
- }
- __syncthreads();
- if (threadIdx.x < 128) {
- shmem[threadIdx.x] += shmem[threadIdx.x + 128];
- }
- __syncthreads();
- if (threadIdx.x < 64) {
- shmem[threadIdx.x] += shmem[threadIdx.x + 64];
- }
- __syncthreads();
- if (threadIdx.x < 32) {
- volatile float* mysh = &shmem[threadIdx.x];
- *mysh += mysh[32];
- *mysh += mysh[16];
- *mysh += mysh[8];
- *mysh += mysh[4];
- *mysh += mysh[2];
- *mysh += mysh[1];
- if (threadIdx.x == 0) {
- target[blockIdx.x] = *mysh;
- }
- }
-}
-
-__global__ void kSetupCurand(curandState *state, unsigned long long seed) {
- const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
- /* Each thread gets same seed, a different sequence number,
- no offset */
- curand_init(seed, tidx, 0, &state[tidx]);
-}
-
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import, division, print_function, unicode_literals
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as n
-from numpy.random import randn, rand, random_integers
-import os
-from threading import Thread
-from util import *
-
-BATCH_META_FILE = "batches.meta"
-
-class DataLoaderThread(Thread):
- def __init__(self, path, tgt):
- Thread.__init__(self)
- self.path = path
- self.tgt = tgt
- def run(self):
- self.tgt += [unpickle(self.path)]
-
-class DataProvider:
- BATCH_REGEX = re.compile(r'^data_batch_(\d+)(\.\d+)?$')
- def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
- if batch_range == None:
- batch_range = DataProvider.get_batch_nums(data_dir)
- if init_batchnum is None or init_batchnum not in batch_range:
- init_batchnum = batch_range[0]
-
- self.data_dir = data_dir
- self.batch_range = batch_range
- self.curr_epoch = init_epoch
- self.curr_batchnum = init_batchnum
- self.dp_params = dp_params
- self.batch_meta = self.get_batch_meta(data_dir)
- self.data_dic = None
- self.test = test
- self.batch_idx = batch_range.index(init_batchnum)
-
- def get_next_batch(self):
- if self.data_dic is None or len(self.batch_range) > 1:
- self.data_dic = self.get_batch(self.curr_batchnum)
- epoch, batchnum = self.curr_epoch, self.curr_batchnum
- self.advance_batch()
-
- return epoch, batchnum, self.data_dic
-
- def get_batch(self, batch_num):
- fname = self.get_data_file_name(batch_num)
- if os.path.isdir(fname): # batch in sub-batches
- sub_batches = sorted(os.listdir(fname), key=alphanum_key)
- #print sub_batches
- num_sub_batches = len(sub_batches)
- tgts = [[] for i in xrange(num_sub_batches)]
- threads = [DataLoaderThread(os.path.join(fname, s), tgt) for (s, tgt) in zip(sub_batches, tgts)]
- for thread in threads:
- thread.start()
- for thread in threads:
- thread.join()
-
- return [t[0] for t in tgts]
- return unpickle(self.get_data_file_name(batch_num))
-
- def get_data_dims(self,idx=0):
- return self.batch_meta['num_vis'] if idx == 0 else 1
-
- def advance_batch(self):
- self.batch_idx = self.get_next_batch_idx()
- self.curr_batchnum = self.batch_range[self.batch_idx]
- if self.batch_idx == 0: # we wrapped
- self.curr_epoch += 1
-
- def get_next_batch_idx(self):
- return (self.batch_idx + 1) % len(self.batch_range)
-
- def get_next_batch_num(self):
- return self.batch_range[self.get_next_batch_idx()]
-
- # get filename of current batch
- def get_data_file_name(self, batchnum=None):
- if batchnum is None:
- batchnum = self.curr_batchnum
- return os.path.join(self.data_dir, 'data_batch_%d' % batchnum)
-
- @classmethod
- def get_instance(cls, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, type="default", dp_params={}, test=False):
- # why the fuck can't i reference DataProvider in the original definition?
- #cls.dp_classes['default'] = DataProvider
- type = type or DataProvider.get_batch_meta(data_dir)['dp_type'] # allow data to decide data provider
- if type.startswith("dummy-"):
- name = "-".join(type.split('-')[:-1]) + "-n"
- if name not in dp_types:
- raise DataProviderException("No such data provider: %s" % type)
- _class = dp_classes[name]
- dims = int(type.split('-')[-1])
- return _class(dims)
- elif type in dp_types:
- _class = dp_classes[type]
- return _class(data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
-
- raise DataProviderException("No such data provider: %s" % type)
-
- @classmethod
- def register_data_provider(cls, name, desc, _class):
- if name in dp_types:
- raise DataProviderException("Data provider %s already registered" % name)
- dp_types[name] = desc
- dp_classes[name] = _class
-
- @staticmethod
- def get_batch_meta(data_dir):
- return unpickle(os.path.join(data_dir, BATCH_META_FILE))
-
- @staticmethod
- def get_batch_filenames(srcdir):
- return sorted([f for f in os.listdir(srcdir) if DataProvider.BATCH_REGEX.match(f)], key=alphanum_key)
-
- @staticmethod
- def get_batch_nums(srcdir):
- names = DataProvider.get_batch_filenames(srcdir)
- return sorted(list(set(int(DataProvider.BATCH_REGEX.match(n).group(1)) for n in names)))
-
- @staticmethod
- def get_num_batches(srcdir):
- return len(DataProvider.get_batch_nums(srcdir))
-
-class DummyDataProvider(DataProvider):
- def __init__(self, data_dim):
- #self.data_dim = data_dim
- self.batch_range = [1]
- self.batch_meta = {'num_vis': data_dim, 'data_in_rows':True}
- self.curr_epoch = 1
- self.curr_batchnum = 1
- self.batch_idx = 0
-
- def get_next_batch(self):
- epoch, batchnum = self.curr_epoch, self.curr_batchnum
- self.advance_batch()
- data = rand(512, self.get_data_dims()).astype(n.single)
- return self.curr_epoch, self.curr_batchnum, {'data':data}
-
-class LabeledDataProvider(DataProvider):
- def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
- DataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
-
- def get_num_classes(self):
- return len(self.batch_meta['label_names'])
-
-class LabeledDummyDataProvider(DummyDataProvider):
- def __init__(self, data_dim, num_classes=10, num_cases=7):
- #self.data_dim = data_dim
- self.batch_range = [1]
- self.batch_meta = {'num_vis': data_dim,
- 'label_names': [str(x) for x in range(num_classes)],
- 'data_in_rows':True}
- self.num_cases = num_cases
- self.num_classes = num_classes
- self.curr_epoch = 1
- self.curr_batchnum = 1
- self.batch_idx=0
- self.data = None
-
- def get_num_classes(self):
- return self.num_classes
-
- def get_next_batch(self):
- epoch, batchnum = self.curr_epoch, self.curr_batchnum
- self.advance_batch()
- if self.data is None:
- data = rand(self.num_cases, self.get_data_dims()).astype(n.single) # <--changed to rand
- labels = n.require(n.c_[random_integers(0,self.num_classes-1,self.num_cases)], requirements='C', dtype=n.single)
- self.data, self.labels = data, labels
- else:
- data, labels = self.data, self.labels
-# print data.shape, labels.shape
- return self.curr_epoch, self.curr_batchnum, [data.T, labels.T ]
-
-
-dp_types = {"dummy-n": "Dummy data provider for n-dimensional data",
- "dummy-labeled-n": "Labeled dummy data provider for n-dimensional data"}
-dp_classes = {"dummy-n": DummyDataProvider,
- "dummy-labeled-n": LabeledDummyDataProvider}
-
-class DataProviderException(Exception):
- pass
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as n
-import os
-from time import time, asctime, localtime, strftime
-from util import *
-from data import *
-from options import *
-from math import ceil, floor, sqrt
-from data import DataProvider, dp_types
-import sys
-import shutil
-import platform
-from os import linesep as NL
-from threading import Thread
-import tempfile as tf
-
-class ModelStateException(Exception):
- pass
-
-class CheckpointWriter(Thread):
- def __init__(self, path, dic):
- Thread.__init__(self)
- self.path = path
- self.dic = dic
-
- def run(self):
- save_dir = os.path.dirname(self.path)
- save_file = os.path.basename(self.path)
- # Write checkpoint to temporary filename
- tmpfile = tf.NamedTemporaryFile(dir=os.path.dirname(save_dir), delete=False)
- pickle(tmpfile, self.dic) # Also closes tf
- # Move it to final filename
- os.rename(tmpfile.name, self.path)
- # Delete old checkpoints
- for f in os.listdir(save_dir):
- if f != save_file:
- os.remove(os.path.join(save_dir, f))
-
-# GPU Model interface
-class IGPUModel:
- def __init__(self, model_name, op, load_dic, filename_options=[], dp_params={}):
- # these are input parameters
- self.model_name = model_name
- self.op = op
- self.options = op.options
- self.load_dic = load_dic
- self.filename_options = filename_options
- self.dp_params = dp_params
- self.device_ids = self.op.get_value('gpu')
- self.fill_excused_options()
- self.checkpoint_writer = None
- #assert self.op.all_values_given()
-
- for o in op.get_options_list():
- setattr(self, o.name, o.value)
- self.loaded_from_checkpoint = load_dic is not None
- # these are things that the model must remember but they're not input parameters
- if self.loaded_from_checkpoint:
- self.model_state = load_dic["model_state"]
- self.save_file = self.options["save_file_override"].value if self.options["save_file_override"].value_given else self.options['load_file'].value
- if not os.path.isdir(self.save_file) and os.path.exists(self.save_file):
- self.save_file = os.path.dirname(self.save_file)
-# print self.options["save_file_override"].value, self.save_file
- else:
- self.model_state = {}
- self.save_file = self.options["save_file_override"].value if self.options["save_file_override"].value_given else os.path.join(self.options['save_path'].value, model_name + "_" + '_'.join(['%s_%s' % (char, self.options[opt].get_str_value()) for opt, char in filename_options]) + '_' + strftime('%Y-%m-%d_%H.%M.%S'))
- self.model_state["train_outputs"] = []
- self.model_state["test_outputs"] = []
- self.model_state["epoch"] = 1
- self.model_state["batchnum"] = self.train_batch_range[0]
-# print self.save_file
-
- self.init_data_providers()
- if load_dic:
- self.train_data_provider.advance_batch()
-
- # model state often requries knowledge of data provider, so it's initialized after
- try:
- self.init_model_state()
- except ModelStateException, e:
- print e
- sys.exit(1)
- for var, val in self.model_state.iteritems():
- setattr(self, var, val)
-
- self.import_model()
- self.init_model_lib()
-
- def import_model(self):
- print "========================="
- print "Importing %s C++ module" % ('_' + self.model_name)
- self.libmodel = __import__('_' + self.model_name)
-
- def fill_excused_options(self):
- pass
-
- def init_data_providers(self):
- self.dp_params['convnet'] = self
- try:
- self.test_data_provider = DataProvider.get_instance(self.data_path, self.test_batch_range,
- type=self.dp_type, dp_params=self.dp_params, test=True)
- self.train_data_provider = DataProvider.get_instance(self.data_path, self.train_batch_range,
- self.model_state["epoch"], self.model_state["batchnum"],
- type=self.dp_type, dp_params=self.dp_params, test=False)
- except DataProviderException, e:
- print "Unable to create data provider: %s" % e
- self.print_data_providers()
- sys.exit()
-
- def init_model_state(self):
- pass
-
- def init_model_lib(self):
- pass
-
- def start(self):
- if self.test_only:
- self.test_outputs += [self.get_test_error()]
- self.print_test_results()
- else:
- self.train()
- self.cleanup()
- if self.force_save:
- self.save_state().join()
- sys.exit(0)
-
- def train(self):
- print "========================="
- print "Training %s" % self.model_name
- self.op.print_values()
- print "========================="
- self.print_model_state()
- print "Running on CUDA device(s) %s" % ", ".join("%d" % d for d in self.device_ids)
- print "Current time: %s" % asctime(localtime())
- print "Saving checkpoints to %s" % self.save_file
- print "========================="
- next_data = self.get_next_batch()
- while self.epoch <= self.num_epochs:
- data = next_data
- self.epoch, self.batchnum = data[0], data[1]
- self.print_iteration()
- sys.stdout.flush()
-
- compute_time_py = time()
- self.start_batch(data)
-
- # load the next batch while the current one is computing
- next_data = self.get_next_batch()
-
- batch_output = self.finish_batch()
- self.train_outputs += [batch_output]
- self.print_train_results()
-
- if self.get_num_batches_done() % self.testing_freq == 0:
- self.sync_with_host()
- self.test_outputs += [self.get_test_error()]
- self.print_test_results()
- self.print_test_status()
- self.conditional_save()
-
- self.print_elapsed_time(time() - compute_time_py)
-
- def cleanup(self):
- if self.checkpoint_writer is not None:
- self.checkpoint_writer.join()
- self.checkpoint_writer = None
-
- def print_model_state(self):
- pass
-
- def get_num_batches_done(self):
- return len(self.train_batch_range) * (self.epoch - 1) + self.batchnum - self.train_batch_range[0] + 1
-
- def get_next_batch(self, train=True):
- dp = self.train_data_provider
- if not train:
- dp = self.test_data_provider
- return self.parse_batch_data(dp.get_next_batch(), train=train)
-
- def parse_batch_data(self, batch_data, train=True):
- return batch_data[0], batch_data[1], batch_data[2]['data']
-
- def start_batch(self, batch_data, train=True):
- self.libmodel.startBatch(batch_data[2], not train)
-
- def finish_batch(self):
- return self.libmodel.finishBatch()
-
- def print_iteration(self):
- print "\t%d.%d..." % (self.epoch, self.batchnum),
-
- def print_elapsed_time(self, compute_time_py):
- print "(%.3f sec)" % (compute_time_py)
-
- def print_train_results(self):
- batch_error = self.train_outputs[-1][0]
- if not (batch_error > 0 and batch_error < 2e20):
- print "Crazy train error: %.6f" % batch_error
- self.cleanup()
-
- print "Train error: %.6f " % (batch_error),
-
- def print_test_results(self):
- batch_error = self.test_outputs[-1][0]
- print "%s\t\tTest error: %.6f" % (NL, batch_error),
-
- def print_test_status(self):
- status = (len(self.test_outputs) == 1 or self.test_outputs[-1][0] < self.test_outputs[-2][0]) and "ok" or "WORSE"
- print status,
-
- def sync_with_host(self):
- if self.checkpoint_writer is not None:
- self.checkpoint_writer.join()
- self.checkpoint_writer = None
- self.libmodel.syncWithHost()
-
- def conditional_save(self):
- batch_error = self.test_outputs[-1][0]
- if batch_error > 0 and batch_error < self.max_test_err:
- self.save_state()
- else:
- print "\tTest error > %g, not saving." % self.max_test_err,
-
- def aggregate_test_outputs(self, test_outputs):
- test_error = tuple([sum(t[r] for t in test_outputs) / (1 if self.test_one else len(self.test_batch_range)) for r in range(len(test_outputs[-1]))])
- return test_error
-
- def get_test_error(self):
- next_data = self.get_next_batch(train=False)
- test_outputs = []
- while True:
- data = next_data
- start_time_test = time()
- self.start_batch(data, train=False)
- load_next = (not self.test_one or self.test_only) and data[1] < self.test_batch_range[-1]
- if load_next: # load next batch
- next_data = self.get_next_batch(train=False)
- test_outputs += [self.finish_batch()]
- if self.test_only: # Print the individual batch results for safety
- print "batch %d: %s" % (data[1], str(test_outputs[-1])),
- self.print_elapsed_time(time() - start_time_test)
- if not load_next:
- break
- sys.stdout.flush()
-
- return self.aggregate_test_outputs(test_outputs)
-
- def set_var(self, var_name, var_val):
- setattr(self, var_name, var_val)
- self.model_state[var_name] = var_val
- return var_val
-
- def get_var(self, var_name):
- return self.model_state[var_name]
-
- def has_var(self, var_name):
- return var_name in self.model_state
-
- def save_state(self):
- for att in self.model_state:
- if hasattr(self, att):
- self.model_state[att] = getattr(self, att)
-
- dic = {"model_state": self.model_state,
- "op": self.op}
-
- checkpoint_file = "%d.%d" % (self.epoch, self.batchnum)
- checkpoint_file_full_path = os.path.join(self.save_file, checkpoint_file)
- if not os.path.exists(self.save_file):
- os.makedirs(self.save_file)
-
- assert self.checkpoint_writer is None
- self.checkpoint_writer = CheckpointWriter(checkpoint_file_full_path, dic)
- self.checkpoint_writer.start()
- print "-------------------------------------------------------"
- print "Saved checkpoint to %s" % self.save_file
- print "=======================================================",
- return self.checkpoint_writer
-
- def get_progress(self):
- num_batches_total = self.num_epochs * len(self.train_batch_range)
- return min(1.0, max(0.0, float(self.get_num_batches_done()-1) / num_batches_total))
-
- @staticmethod
- def load_checkpoint(load_dir):
- if os.path.isdir(load_dir):
- return unpickle(os.path.join(load_dir, sorted(os.listdir(load_dir), key=alphanum_key)[-1]))
- return unpickle(load_dir)
-
- @staticmethod
- def get_options_parser():
- op = OptionsParser()
- op.add_option("load-file", "load_file", StringOptionParser, "Load file", default="", excuses=OptionsParser.EXCUSE_ALL)
- op.add_option("save-path", "save_path", StringOptionParser, "Save path", excuses=['save_file_override'])
- op.add_option("save-file", "save_file_override", StringOptionParser, "Save file override", excuses=['save_path'])
- op.add_option("train-range", "train_batch_range", RangeOptionParser, "Data batch range: training")
- op.add_option("test-range", "test_batch_range", RangeOptionParser, "Data batch range: testing")
- op.add_option("data-provider", "dp_type", StringOptionParser, "Data provider", default="default")
- op.add_option("test-freq", "testing_freq", IntegerOptionParser, "Testing frequency", default=25)
- op.add_option("epochs", "num_epochs", IntegerOptionParser, "Number of epochs", default=500)
- op.add_option("data-path", "data_path", StringOptionParser, "Data path")
-
- op.add_option("max-test-err", "max_test_err", FloatOptionParser, "Maximum test error for saving")
- op.add_option("test-only", "test_only", BooleanOptionParser, "Test and quit?", default=0)
- op.add_option("test-one", "test_one", BooleanOptionParser, "Test on one batch at a time?", default=1)
- op.add_option("force-save", "force_save", BooleanOptionParser, "Force save before quitting", default=0)
- op.add_option("gpu", "gpu", ListOptionParser(IntegerOptionParser), "GPU override")
- return op
-
- @staticmethod
- def print_data_providers():
- print "Available data providers:"
- for dp, desc in dp_types.iteritems():
- print " %s: %s" % (dp, desc)
-
-
- @staticmethod
- def parse_options(op):
- try:
- load_dic = None
- options = op.parse()
- load_location = None
-# print options['load_file'].value_given, options['save_file_override'].value_given
-# print options['save_file_override'].value
- if options['load_file'].value_given:
- load_location = options['load_file'].value
- elif options['save_file_override'].value_given and os.path.exists(options['save_file_override'].value):
- load_location = options['save_file_override'].value
-
- if load_location is not None:
- load_dic = IGPUModel.load_checkpoint(load_location)
- old_op = load_dic["op"]
- old_op.merge_from(op)
- op = old_op
- op.eval_expr_defaults()
- return op, load_dic
- except OptionMissingException, e:
- print e
- op.print_usage()
- except OptionException, e:
- print e
- except UnpickleError, e:
- print "Error loading checkpoint:"
- print e
- sys.exit()
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-from getopt import getopt
-import os
-import re
-#import types
-
-TERM_BOLD_START = "\033[1m"
-TERM_BOLD_END = "\033[0m"
-
-class Option:
- def __init__(self, letter, name, desc, parser, set_once, default, excuses, requires, save):
- assert not name is None
- self.letter = letter
- self.name = name
- self.desc = desc
- self.parser = parser
- self.set_once = set_once
- self.default = default
- self.excuses = excuses
- self.requires = requires
- self.save = save
-
- self.value = None
- self.value_given = False
- self.prefixed_letter = min(2, len(letter)) * '-' + letter
-
- def set_value(self, value, parse=True):
- try:
- self.value = self.parser.parse(value) if parse else value
- self.value_given = True
-# print self.name, self.value
- except OptionException, e:
- raise OptionException("Unable to parse option %s (%s): %s" % (self.prefixed_letter, self.desc, e))
-
- def set_default(self):
- if not self.default is None:
- self.value = self.default
-
- def eval_expr_default(self, env):
- try:
- if isinstance(self.default, OptionExpression) and not self.value_given:
- self.value = self.default.evaluate(env)
- if not self.parser.is_type(self.value):
- raise OptionException("expression result %s is not of right type (%s)" % (self.value, self.parser.get_type_str()))
- except Exception, e:
- raise OptionException("Unable to set default value for option %s (%s): %s" % (self.prefixed_letter, self.desc, e))
-
- def get_str_value(self, get_default_str=False):
- val = self.value
- if get_default_str: val = self.default
- if val is None: return ""
- if isinstance(val, OptionExpression):
- return val.expr
- return self.parser.to_string(val)
-
-class OptionsParser:
- """An option parsing class. All options without default values are mandatory, unless a excuses
- option (usually a load file) is given.
- Does not support options without arguments."""
- SORT_LETTER = 1
- SORT_DESC = 2
- SORT_EXPR_LAST = 3
- EXCUSE_ALL = "all"
- def __init__(self):
- self.options = {}
-
- def add_option(self, letter, name, parser, desc, set_once=False, default=None, excuses=[], requires=[], save=True):
- """
- The letter parameter is the actual parameter that the user will have to supply on the command line.
- The name parameter is some name to be given to this option and must be a valid python variable name.
-
- An explanation of the "default" parameter:
- The default value, if specified, should have the same type as the option.
- You can also specify an expression as the default value. In this case, the default value of the parameter
- will be the output of the expression. The expression may assume all other option names
- as local variables. For example, you can define the hidden bias
- learning rate to be 10 times the weight learning rate by setting this default:
-
- default=OptionExpression("eps_w * 10") (assuming an option named eps_w exists).
-
- However, it is up to you to make sure you do not make any circular expression definitions.
-
- Note that the order in which the options are parsed is arbitrary.
- In particular, expression default values that depend on other expression default values
- will often raise errors (depending on the order in which they happen to be parsed).
- Therefore it is best not to make the default value of one variable depend on the value
- of another if the other variable's default value is itself an expression.
-
- An explanation of the "excuses" parameter:
- All options are mandatory, but certain options can exclude other options from being mandatory.
- For example, if the excuses parameter for option "load_file" is ["num_hid", "num_vis"],
- then the options num_hid and num_vis are not mandatory as long as load_file is specified.
- Use the special flag EXCUSE_ALL to allow an option to make all other options optional.
- """
-
- assert name not in self.options
- self.options[name] = Option(letter, name, desc, parser, set_once, default, excuses, requires, save)
-
- def set_value(self, name, value, parse=True):
- self.options[name].set_value(value, parse=parse)
-
- def get_value(self, name):
- return self.options[name].value
-
- def delete_option(self, name):
- if name in self.options:
- del self.options[name]
-
- def parse(self, eval_expr_defaults=False):
- """Parses the options in sys.argv based on the options added to this parser. The
- default behavior is to leave any expression default options as OptionExpression objects.
- Set eval_expr_defaults=True to circumvent this."""
- short_opt_str = ''.join(["%s:" % self.options[name].letter for name in self.options if len(self.options[name].letter) == 1])
- long_opts = ["%s=" % self.options[name].letter for name in self.options if len(self.options[name].letter) > 1]
- (go, ga) = getopt(sys.argv[1:], short_opt_str, longopts=long_opts)
- dic = dict(go)
-
- for o in self.get_options_list(sort_order=self.SORT_EXPR_LAST):
- if o.prefixed_letter in dic:
- o.set_value(dic[o.prefixed_letter])
- else:
- # check if excused or has default
- excused = max([o2.prefixed_letter in dic for o2 in self.options.values() if o2.excuses == self.EXCUSE_ALL or o.name in o2.excuses])
- if not excused and o.default is None:
- raise OptionMissingException("Option %s (%s) not supplied" % (o.prefixed_letter, o.desc))
- o.set_default()
- # check requirements
- if o.prefixed_letter in dic:
- for o2 in self.get_options_list(sort_order=self.SORT_LETTER):
- if o2.name in o.requires and o2.prefixed_letter not in dic:
- raise OptionMissingException("Option %s (%s) requires option %s (%s)" % (o.prefixed_letter, o.desc,
- o2.prefixed_letter, o2.desc))
- if eval_expr_defaults:
- self.eval_expr_defaults()
- return self.options
-
- def merge_from(self, op2):
- """Merges the options in op2 into this instance, but does not overwrite
- this instances's SET options with op2's default values."""
- for name, o in self.options.iteritems():
- if name in op2.options and ((op2.options[name].value_given and op2.options[name].value != self.options[name].value) or not op2.options[name].save):
- if op2.options[name].set_once:
- raise OptionException("Option %s (%s) cannot be changed" % (op2.options[name].prefixed_letter, op2.options[name].desc))
- self.options[name] = op2.options[name]
- for name in op2.options:
- if name not in self.options:
- self.options[name] = op2.options[name]
-
- def eval_expr_defaults(self):
- env = dict([(name, o.value) for name, o in self.options.iteritems()])
- for o in self.options.values():
- o.eval_expr_default(env)
-
- def all_values_given(self):
- return max([o.value_given for o in self.options.values() if o.default is not None])
-
- def get_options_list(self, sort_order=SORT_LETTER):
- """ Returns the list of Option objects in this OptionParser,
- sorted as specified"""
-
- cmp = lambda x, y: (x.desc < y.desc and -1 or 1)
- if sort_order == self.SORT_LETTER:
- cmp = lambda x, y: (x.letter < y.letter and -1 or 1)
- elif sort_order == self.SORT_EXPR_LAST:
- cmp = lambda x, y: (type(x.default) == OptionExpression and 1 or -1)
- return sorted(self.options.values(), cmp=cmp)
-
- def print_usage(self, print_constraints=False):
- print "%s usage:" % os.path.basename(sys.argv[0])
- opslist = self.get_options_list()
-
- usage_strings = []
- num_def = 0
- for o in opslist:
- excs = ' '
- if o.default is None:
- excs = ', '.join(sorted([o2.prefixed_letter for o2 in self.options.values() if o2.excuses == self.EXCUSE_ALL or o.name in o2.excuses]))
- reqs = ', '.join(sorted([o2.prefixed_letter for o2 in self.options.values() if o2.name in o.requires]))
- usg = (OptionsParser._bold(o.prefixed_letter) + " <%s>" % o.parser.get_type_str(), o.desc, ("[%s]" % o.get_str_value(get_default_str=True)) if not o.default is None else None, excs, reqs)
- if o.default is None:
- usage_strings += [usg]
- else:
- usage_strings.insert(num_def, usg)
- num_def += 1
-
- col_widths = [self._longest_value(usage_strings, key=lambda x:x[i]) for i in range(len(usage_strings[0]) - 1)]
-
- col_names = [" Option", "Description", "Default"]
- if print_constraints:
- col_names += ["Excused by", "Requires"]
- for i, s in enumerate(col_names):
- print self._bold(s.ljust(col_widths[i])),
-
- print ""
- for l, d, de, ex, req in usage_strings:
- if de is None:
- de = ' '
- print (" %s -" % l.ljust(col_widths[0])), d.ljust(col_widths[1]), de.ljust(col_widths[2]),
- else:
- print (" [%s] -" % l.ljust(col_widths[0])), d.ljust(col_widths[1]), de.ljust(col_widths[2]),
- if print_constraints:
- print ex.ljust(col_widths[3]), req
- else:
- print ""
-
- def print_values(self):
- longest_desc = self._longest_value(self.options.values(), key=lambda x:x.desc)
- longest_def_value = self._longest_value([v for v in self.options.values() if not v.value_given and not v.default is None],
- key=lambda x:x.get_str_value())
- for o in self.get_options_list(sort_order=self.SORT_DESC):
- print "%s: %s %s" % (o.desc.ljust(longest_desc), o.get_str_value().ljust(longest_def_value), (not o.value_given and not o.default is None) and "[DEFAULT]" or "")
-
- @staticmethod
- def _longest_value(values, key=lambda x:x):
- mylen = lambda x: 0 if x is None else len(x)
- return mylen(key(max(values, key=lambda x:mylen(key(x)))))
-
- @staticmethod
- def _bold(str):
- return TERM_BOLD_START + str + TERM_BOLD_END
-
-class OptionException(Exception):
- pass
-
-class OptionMissingException(OptionException):
- pass
-
-class OptionParser:
- @staticmethod
- def parse(value):
- return str(value)
-
- @staticmethod
- def to_string(value):
- return str(value)
-
- @staticmethod
- def get_type_str():
- pass
-
-class IntegerOptionParser(OptionParser):
- @staticmethod
- def parse(value):
- try:
- return int(value)
- except:
- raise OptionException("argument is not an integer")
-
- @staticmethod
- def get_type_str():
- return "int"
-
- @staticmethod
- def is_type(value):
- return type(value) == int
-
-class BooleanOptionParser(OptionParser):
- @staticmethod
- def parse(value):
- try:
- v = int(value)
- if not v in (0,1):
- raise OptionException
- return v
- except:
- raise OptionException("argument is not a boolean")
-
- @staticmethod
- def get_type_str():
- return "0/1"
-
- @staticmethod
- def is_type(value):
- return type(value) == int and value in (0, 1)
-
-class StringOptionParser(OptionParser):
- @staticmethod
- def get_type_str():
- return "string"
-
- @staticmethod
- def is_type(value):
- return type(value) == str
-
-class FloatOptionParser(OptionParser):
- @staticmethod
- def parse(value):
- try:
- return float(value)
- except:
- raise OptionException("argument is not a float")
-
- @staticmethod
- def to_string(value):
- return "%.6g" % value
-
- @staticmethod
- def get_type_str():
- return "float"
-
- @staticmethod
- def is_type(value):
- return type(value) == float
-
-class RangeOptionParser(OptionParser):
- @staticmethod
- def parse(value):
- m = re.match("^(\d+)\-(\d+)$", value)
- try:
- if m: return range(int(m.group(1)), int(m.group(2)) + 1)
- return [int(value)]
- except:
- raise OptionException("argument is neither an integer nor a range")
-
- @staticmethod
- def to_string(value):
- return "%d-%d" % (value[0], value[-1])
-
- @staticmethod
- def get_type_str():
- return "int[-int]"
-
- @staticmethod
- def is_type(value):
- return type(value) == list
-
-class ListOptionParser(OptionParser):
- """
- A parser that parses a delimited list of items. If the "parsers"
- argument is a list of parsers, then the list of items must have the form and length
- specified by that list.
-
- Example:
- ListOptionParser([FloatOptionParser, IntegerOptionParser])
-
- would parse "0.5,3" but not "0.5,3,0.6" or "0.5" or "3,0.5".
-
- If the "parsers" argument is another parser, then the list of items may be of
- arbitrary length, but each item must be parseable by the given parser.
-
- Example:
- ListOptionParser(FloatOptionParser)
-
- would parse "0.5" and "0.5,0.3" and "0.5,0.3,0.6", etc.
- """
- def __init__(self, parsers, sepchar=','):
- self.parsers = parsers
- self.sepchar = sepchar
-
- def parse(self, value):
- values = value.split(self.sepchar)
- if type(self.parsers) == list and len(values) != len(self.parsers):
- raise OptionException("requires %d arguments, given %d" % (len(self.parsers), len(values)))
-
- try:
- if type(self.parsers) == list:
- return [p.parse(v) for p, v in zip(self.parsers, values)]
- return [self.parsers.parse(v) for v in values]
- except:
- raise OptionException("argument is not of the form %s" % self.get_type_str())
-
- def to_string(self, value):
- if type(self.parsers) == list:
- return self.sepchar.join([p.to_string(v) for p, v in zip(self.parsers, value)])
- return self.sepchar.join([self.parsers.to_string(v) for v in value])
-
- def get_type_str(self):
- if type(self.parsers) == list:
- return self.sepchar.join([p.get_type_str() for p in self.parsers])
- return "%s%s..." % (self.parsers.get_type_str(), self.sepchar)
-
- @staticmethod
- def is_type(value):
- return type(value) == list
-
-class OptionExpression:
- """
- This allows you to specify option values in terms of other option values.
- Example:
- op.add_option("eps-w", "eps_w", ListOptionParser(FloatOptionParser), "Weight learning rates for each layer")
- op.add_option("eps-b", "eps_b", ListOptionParser(FloatOptionParser), "Bias learning rates for each layer", default=OptionExpression("[o * 10 for o in eps_w]"))
-
- This says: the default bias learning rate for each layer is 10
- times the weight learning rate for that layer.
- """
- def __init__(self, expr):
- self.expr = expr
-
- def evaluate(self, options):
- locals().update(options)
- try:
- return eval(self.expr)
- except Exception, e:
- raise OptionException("expression '%s': unable to parse: %s" % (self.expr, e))
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-import cPickle
-import os
-from cStringIO import StringIO
-
-class UnpickleError(Exception):
- pass
-
-GPU_LOCK_NO_SCRIPT = -2
-GPU_LOCK_NO_LOCK = -1
-
-def pickle(filename, data):
- fo = filename
- if type(filename) == str:
- fo = open(filename, "w")
-
- with fo:
- cPickle.dump(data, fo, protocol=cPickle.HIGHEST_PROTOCOL)
-
-def unpickle(filename):
- if not os.path.exists(filename):
- raise UnpickleError("Path '%s' does not exist." % filename)
-
- with open(filename) as fo, StringIO() as z:
- file_size = os.fstat(fo.fileno()).st_size
- # Read 1GB at a time to avoid overflow
- while fo.tell() < file_size:
- z.write(fo.read(1 << 30))
- return cPickle.loads(z.getvalue())
-
-def is_intel_machine():
- VENDOR_ID_REGEX = re.compile(r'^vendor_id\s+: (\S+)')
- with open('/proc/cpuinfo') as f:
- for line in f:
- m = VENDOR_ID_REGEX.match(line)
- if m:
- return m.group(1) == 'GenuineIntel'
- return False
-
-# Returns the CPUs associated with a given GPU
-def get_cpus_for_gpu(gpu):
- #proc = subprocess.Popen(['nvidia-smi', '-q', '-i', str(gpu)], stdout=subprocess.PIPE)
- #lines = proc.communicate()[0]
- #lines = subprocess.check_output(['nvidia-smi', '-q', '-i', str(gpu)]).split(os.linesep)
-
- with open('/proc/driver/nvidia/gpus/%d/information' % gpu) as f:
- for line in f:
- if line.startswith('Bus Location'):
- bus_id = line.split(':', 1)[1].strip()
- bus_id = bus_id[:7] + ':' + bus_id[8:]
- with open('/sys/module/nvidia/drivers/pci:nvidia/%s/local_cpulist' % bus_id) as ff:
- cpus_str = ff.readline()
- cpus = [cpu for s in cpus_str.split(',') for cpu in range(int(s.split('-')[0]),int(s.split('-')[1])+1)]
- return cpus
- return [-1]
-
-def get_cpu():
- if is_intel_machine():
- return 'intel'
- return 'amd'
-
-def is_windows_machine():
- return os.name == 'nt'
-
-def tryint(s):
- try:
- return int(s)
- except:
- return s
-
-def alphanum_key(s):
- return [tryint(c) for c in re.split('([0-9]+)', s)]
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-from tarfile import TarFile, TarInfo
-from matplotlib import pylab as pl
-import numpy as n
-import getopt as opt
-from python_util.util import *
-from math import sqrt, ceil, floor
-from python_util.gpumodel import IGPUModel
-import random as r
-import numpy.random as nr
-from convnet import ConvNet
-from python_util.options import *
-from PIL import Image
-from time import sleep
-
-class ShowNetError(Exception):
- pass
-
-class ShowConvNet(ConvNet):
- def __init__(self, op, load_dic):
- ConvNet.__init__(self, op, load_dic)
-
- def init_data_providers(self):
- self.need_gpu = self.op.get_value('show_preds')
- class Dummy:
- def advance_batch(self):
- pass
- if self.need_gpu:
- ConvNet.init_data_providers(self)
- else:
- self.train_data_provider = self.test_data_provider = Dummy()
-
- def import_model(self):
- if self.need_gpu:
- ConvNet.import_model(self)
-
- def init_model_state(self):
- if self.op.get_value('show_preds'):
- self.softmax_name = self.op.get_value('show_preds')
-
- def init_model_lib(self):
- if self.need_gpu:
- ConvNet.init_model_lib(self)
-
- def plot_cost(self):
- if self.show_cost not in self.train_outputs[0][0]:
- raise ShowNetError("Cost function with name '%s' not defined by given convnet." % self.show_cost)
-# print self.test_outputs
- train_errors = [eval(self.layers[self.show_cost]['outputFilter'])(o[0][self.show_cost], o[1])[self.cost_idx] for o in self.train_outputs]
- test_errors = [eval(self.layers[self.show_cost]['outputFilter'])(o[0][self.show_cost], o[1])[self.cost_idx] for o in self.test_outputs]
- if self.smooth_test_errors:
- test_errors = [sum(test_errors[max(0,i-len(self.test_batch_range)):i])/(i-max(0,i-len(self.test_batch_range))) for i in xrange(1,len(test_errors)+1)]
- numbatches = len(self.train_batch_range)
- test_errors = n.row_stack(test_errors)
- test_errors = n.tile(test_errors, (1, self.testing_freq))
- test_errors = list(test_errors.flatten())
- test_errors += [test_errors[-1]] * max(0,len(train_errors) - len(test_errors))
- test_errors = test_errors[:len(train_errors)]
-
- numepochs = len(train_errors) / float(numbatches)
- pl.figure(1)
- x = range(0, len(train_errors))
- pl.plot(x, train_errors, 'k-', label='Training set')
- pl.plot(x, test_errors, 'r-', label='Test set')
- pl.legend()
- ticklocs = range(numbatches, len(train_errors) - len(train_errors) % numbatches + 1, numbatches)
- epoch_label_gran = int(ceil(numepochs / 20.))
- epoch_label_gran = int(ceil(float(epoch_label_gran) / 10) * 10) if numepochs >= 10 else epoch_label_gran
- ticklabels = map(lambda x: str((x[1] / numbatches)) if x[0] % epoch_label_gran == epoch_label_gran-1 else '', enumerate(ticklocs))
-
- pl.xticks(ticklocs, ticklabels)
- pl.xlabel('Epoch')
-# pl.ylabel(self.show_cost)
- pl.title('%s[%d]' % (self.show_cost, self.cost_idx))
-# print "plotted cost"
-
- def make_filter_fig(self, filters, filter_start, fignum, _title, num_filters, combine_chans, FILTERS_PER_ROW=16):
- MAX_ROWS = 24
- MAX_FILTERS = FILTERS_PER_ROW * MAX_ROWS
- num_colors = filters.shape[0]
- f_per_row = int(ceil(FILTERS_PER_ROW / float(1 if combine_chans else num_colors)))
- filter_end = min(filter_start+MAX_FILTERS, num_filters)
- filter_rows = int(ceil(float(filter_end - filter_start) / f_per_row))
-
- filter_pixels = filters.shape[1]
- filter_size = int(sqrt(filters.shape[1]))
- fig = pl.figure(fignum)
- fig.text(.5, .95, '%s %dx%d filters %d-%d' % (_title, filter_size, filter_size, filter_start, filter_end-1), horizontalalignment='center')
- num_filters = filter_end - filter_start
- if not combine_chans:
- bigpic = n.zeros((filter_size * filter_rows + filter_rows + 1, filter_size*num_colors * f_per_row + f_per_row + 1), dtype=n.single)
- else:
- bigpic = n.zeros((3, filter_size * filter_rows + filter_rows + 1, filter_size * f_per_row + f_per_row + 1), dtype=n.single)
-
- for m in xrange(filter_start,filter_end ):
- filter = filters[:,:,m]
- y, x = (m - filter_start) / f_per_row, (m - filter_start) % f_per_row
- if not combine_chans:
- for c in xrange(num_colors):
- filter_pic = filter[c,:].reshape((filter_size,filter_size))
- bigpic[1 + (1 + filter_size) * y:1 + (1 + filter_size) * y + filter_size,
- 1 + (1 + filter_size*num_colors) * x + filter_size*c:1 + (1 + filter_size*num_colors) * x + filter_size*(c+1)] = filter_pic
- else:
- filter_pic = filter.reshape((3, filter_size,filter_size))
- bigpic[:,
- 1 + (1 + filter_size) * y:1 + (1 + filter_size) * y + filter_size,
- 1 + (1 + filter_size) * x:1 + (1 + filter_size) * x + filter_size] = filter_pic
-
- pl.xticks([])
- pl.yticks([])
- if not combine_chans:
- pl.imshow(bigpic, cmap=pl.cm.gray, interpolation='nearest')
- else:
- bigpic = bigpic.swapaxes(0,2).swapaxes(0,1)
- pl.imshow(bigpic, interpolation='nearest')
-
- def plot_filters(self):
- FILTERS_PER_ROW = 16
- filter_start = 0 # First filter to show
- if self.show_filters not in self.layers:
- raise ShowNetError("Layer with name '%s' not defined by given convnet." % self.show_filters)
- layer = self.layers[self.show_filters]
- filters = layer['weights'][self.input_idx]
-# filters = filters - filters.min()
-# filters = filters / filters.max()
- if layer['type'] == 'fc': # Fully-connected layer
- num_filters = layer['outputs']
- channels = self.channels
- filters = filters.reshape(channels, filters.shape[0]/channels, filters.shape[1])
- elif layer['type'] in ('conv', 'local'): # Conv layer
- num_filters = layer['filters']
- channels = layer['filterChannels'][self.input_idx]
- if layer['type'] == 'local':
- filters = filters.reshape((layer['modules'], channels, layer['filterPixels'][self.input_idx], num_filters))
- filters = filters[:, :, :, self.local_plane] # first map for now (modules, channels, pixels)
- filters = filters.swapaxes(0,2).swapaxes(0,1)
- num_filters = layer['modules']
-# filters = filters.swapaxes(0,1).reshape(channels * layer['filterPixels'][self.input_idx], num_filters * layer['modules'])
-# num_filters *= layer['modules']
- FILTERS_PER_ROW = layer['modulesX']
- else:
- filters = filters.reshape(channels, filters.shape[0]/channels, filters.shape[1])
-
-
- # Convert YUV filters to RGB
- if self.yuv_to_rgb and channels == 3:
- R = filters[0,:,:] + 1.28033 * filters[2,:,:]
- G = filters[0,:,:] + -0.21482 * filters[1,:,:] + -0.38059 * filters[2,:,:]
- B = filters[0,:,:] + 2.12798 * filters[1,:,:]
- filters[0,:,:], filters[1,:,:], filters[2,:,:] = R, G, B
- combine_chans = not self.no_rgb and channels == 3
-
- # Make sure you don't modify the backing array itself here -- so no -= or /=
- if self.norm_filters:
- #print filters.shape
- filters = filters - n.tile(filters.reshape((filters.shape[0] * filters.shape[1], filters.shape[2])).mean(axis=0).reshape(1, 1, filters.shape[2]), (filters.shape[0], filters.shape[1], 1))
- filters = filters / n.sqrt(n.tile(filters.reshape((filters.shape[0] * filters.shape[1], filters.shape[2])).var(axis=0).reshape(1, 1, filters.shape[2]), (filters.shape[0], filters.shape[1], 1)))
- #filters = filters - n.tile(filters.min(axis=0).min(axis=0), (3, filters.shape[1], 1))
- #filters = filters / n.tile(filters.max(axis=0).max(axis=0), (3, filters.shape[1], 1))
- #else:
- filters = filters - filters.min()
- filters = filters / filters.max()
-
- self.make_filter_fig(filters, filter_start, 2, 'Layer %s' % self.show_filters, num_filters, combine_chans, FILTERS_PER_ROW=FILTERS_PER_ROW)
-
- def plot_predictions(self):
- epoch, batch, data = self.get_next_batch(train=False) # get a test batch
- num_classes = self.test_data_provider.get_num_classes()
- NUM_ROWS = 2
- NUM_COLS = 4
- NUM_IMGS = NUM_ROWS * NUM_COLS if not self.save_preds else data[0].shape[1]
- NUM_TOP_CLASSES = min(num_classes, 5) # show this many top labels
- NUM_OUTPUTS = self.model_state['layers'][self.softmax_name]['outputs']
- PRED_IDX = 1
-
- label_names = [lab.split(',')[0] for lab in self.test_data_provider.batch_meta['label_names']]
- if self.only_errors:
- preds = n.zeros((data[0].shape[1], NUM_OUTPUTS), dtype=n.single)
- else:
- preds = n.zeros((NUM_IMGS, NUM_OUTPUTS), dtype=n.single)
- #rand_idx = nr.permutation(n.r_[n.arange(1), n.where(data[1] == 552)[1], n.where(data[1] == 795)[1], n.where(data[1] == 449)[1], n.where(data[1] == 274)[1]])[:NUM_IMGS]
- rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS)
- if NUM_IMGS < data[0].shape[1]:
- data = [n.require(d[:,rand_idx], requirements='C') for d in data]
-# data += [preds]
- # Run the model
- print [d.shape for d in data], preds.shape
- self.libmodel.startFeatureWriter(data, [preds], [self.softmax_name])
- IGPUModel.finish_batch(self)
- print preds
- data[0] = self.test_data_provider.get_plottable_data(data[0])
-
- if self.save_preds:
- if not gfile.Exists(self.save_preds):
- gfile.MakeDirs(self.save_preds)
- preds_thresh = preds > 0.5 # Binarize predictions
- data[0] = data[0] * 255.0
- data[0][data[0]<0] = 0
- data[0][data[0]>255] = 255
- data[0] = n.require(data[0], dtype=n.uint8)
- dir_name = '%s_predictions_batch_%d' % (os.path.basename(self.save_file), batch)
- tar_name = os.path.join(self.save_preds, '%s.tar' % dir_name)
- tfo = gfile.GFile(tar_name, "w")
- tf = TarFile(fileobj=tfo, mode='w')
- for img_idx in xrange(NUM_IMGS):
- img = data[0][img_idx,:,:,:]
- imsave = Image.fromarray(img)
- prefix = "CORRECT" if data[1][0,img_idx] == preds_thresh[img_idx,PRED_IDX] else "FALSE_POS" if preds_thresh[img_idx,PRED_IDX] == 1 else "FALSE_NEG"
- file_name = "%s_%.2f_%d_%05d_%d.png" % (prefix, preds[img_idx,PRED_IDX], batch, img_idx, data[1][0,img_idx])
-# gf = gfile.GFile(file_name, "w")
- file_string = StringIO()
- imsave.save(file_string, "PNG")
- tarinf = TarInfo(os.path.join(dir_name, file_name))
- tarinf.size = file_string.tell()
- file_string.seek(0)
- tf.addfile(tarinf, file_string)
- tf.close()
- tfo.close()
-# gf.close()
- print "Wrote %d prediction PNGs to %s" % (preds.shape[0], tar_name)
- else:
- fig = pl.figure(3, figsize=(12,9))
- fig.text(.4, .95, '%s test samples' % ('Mistaken' if self.only_errors else 'Random'))
- if self.only_errors:
- # what the net got wrong
- if NUM_OUTPUTS > 1:
- err_idx = [i for i,p in enumerate(preds.argmax(axis=1)) if p not in n.where(data[2][:,i] > 0)[0]]
- else:
- err_idx = n.where(data[1][0,:] != preds[:,0].T)[0]
- print err_idx
- err_idx = r.sample(err_idx, min(len(err_idx), NUM_IMGS))
- data[0], data[1], preds = data[0][:,err_idx], data[1][:,err_idx], preds[err_idx,:]
-
-
- import matplotlib.gridspec as gridspec
- import matplotlib.colors as colors
- cconv = colors.ColorConverter()
- gs = gridspec.GridSpec(NUM_ROWS*2, NUM_COLS,
- width_ratios=[1]*NUM_COLS, height_ratios=[2,1]*NUM_ROWS )
- #print data[1]
- for row in xrange(NUM_ROWS):
- for col in xrange(NUM_COLS):
- img_idx = row * NUM_COLS + col
- if data[0].shape[0] <= img_idx:
- break
- pl.subplot(gs[(row * 2) * NUM_COLS + col])
- #pl.subplot(NUM_ROWS*2, NUM_COLS, row * 2 * NUM_COLS + col + 1)
- pl.xticks([])
- pl.yticks([])
- img = data[0][img_idx,:,:,:]
- pl.imshow(img, interpolation='lanczos')
- show_title = data[1].shape[0] == 1
- true_label = [int(data[1][0,img_idx])] if show_title else n.where(data[1][:,img_idx]==1)[0]
- #print true_label
- #print preds[img_idx,:].shape
- #print preds[img_idx,:].max()
- true_label_names = [label_names[i] for i in true_label]
- img_labels = sorted(zip(preds[img_idx,:], label_names), key=lambda x: x[0])[-NUM_TOP_CLASSES:]
- #print img_labels
- axes = pl.subplot(gs[(row * 2 + 1) * NUM_COLS + col])
- height = 0.5
- ylocs = n.array(range(NUM_TOP_CLASSES))*height
- pl.barh(ylocs, [l[0] for l in img_labels], height=height, \
- color=['#ffaaaa' if l[1] in true_label_names else '#aaaaff' for l in img_labels])
- #pl.title(", ".join(true_labels))
- if show_title:
- pl.title(", ".join(true_label_names), fontsize=15, fontweight='bold')
- else:
- print true_label_names
- pl.yticks(ylocs + height/2, [l[1] for l in img_labels], x=1, backgroundcolor=cconv.to_rgba('0.65', alpha=0.5), weight='bold')
- for line in enumerate(axes.get_yticklines()):
- line[1].set_visible(False)
- #pl.xticks([width], [''])
- #pl.yticks([])
- pl.xticks([])
- pl.ylim(0, ylocs[-1] + height)
- pl.xlim(0, 1)
-
- def start(self):
- self.op.print_values()
-# print self.show_cost
- if self.show_cost:
- self.plot_cost()
- if self.show_filters:
- self.plot_filters()
- if self.show_preds:
- self.plot_predictions()
-
- if pl:
- pl.show()
- sys.exit(0)
-
- @classmethod
- def get_options_parser(cls):
- op = ConvNet.get_options_parser()
- for option in list(op.options):
- if option not in ('gpu', 'load_file', 'inner_size', 'train_batch_range', 'test_batch_range', 'multiview_test', 'data_path', 'pca_noise', 'scalar_mean'):
- op.delete_option(option)
- op.add_option("show-cost", "show_cost", StringOptionParser, "Show specified objective function", default="")
- op.add_option("show-filters", "show_filters", StringOptionParser, "Show learned filters in specified layer", default="")
- op.add_option("norm-filters", "norm_filters", BooleanOptionParser, "Individually normalize filters shown with --show-filters", default=0)
- op.add_option("input-idx", "input_idx", IntegerOptionParser, "Input index for layer given to --show-filters", default=0)
- op.add_option("cost-idx", "cost_idx", IntegerOptionParser, "Cost function return value index for --show-cost", default=0)
- op.add_option("no-rgb", "no_rgb", BooleanOptionParser, "Don't combine filter channels into RGB in layer given to --show-filters", default=False)
- op.add_option("yuv-to-rgb", "yuv_to_rgb", BooleanOptionParser, "Convert RGB filters to YUV in layer given to --show-filters", default=False)
- op.add_option("channels", "channels", IntegerOptionParser, "Number of channels in layer given to --show-filters (fully-connected layers only)", default=0)
- op.add_option("show-preds", "show_preds", StringOptionParser, "Show predictions made by given softmax on test set", default="")
- op.add_option("save-preds", "save_preds", StringOptionParser, "Save predictions to given path instead of showing them", default="")
- op.add_option("only-errors", "only_errors", BooleanOptionParser, "Show only mistaken predictions (to be used with --show-preds)", default=False, requires=['show_preds'])
- op.add_option("local-plane", "local_plane", IntegerOptionParser, "Local plane to show", default=0)
- op.add_option("smooth-test-errors", "smooth_test_errors", BooleanOptionParser, "Use running average for test error plot?", default=1)
-
- op.options['load_file'].default = None
- return op
-
-if __name__ == "__main__":
- #nr.seed(6)
- try:
- op = ShowConvNet.get_options_parser()
- op, load_dic = IGPUModel.parse_options(op)
- model = ShowConvNet(op, load_dic)
- model.start()
- except (UnpickleError, ShowNetError, opt.GetoptError), e:
- print "----------------"
- print "Error:"
- print e
+++ /dev/null
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-LINK_LIBS := -L$(ATLAS_LIB_PATH) -latlas -lcblas
-INCLUDES := -I./include
-COMMONFLAGS :=
-CC_ARGS :=
-CC=g++
-
-ifndef debug
- CC_ARGS += -O3
-endif
-
-OUT_DIR=./bin/$(OUT_SUFFIX)
-OUT_FILE=libutil.so
-
-ifeq ($(numpy), 1)
- PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2)
- LINK_LIBS += -lpython$(PYTHON_VERSION)
-
- INCLUDES += -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH)
- COMMONFLAGS += -DNUMPY_INTERFACE
- OUT_FILE=libutilpy.so
-endif
-
-OBJECTS = matrix.cpp
-
-all: dir classes $(OUT_FILE)
-
-dir:
- mkdir -p $(OUT_DIR)/src
-
-SOURCES = $(shell echo src/*.cpp)
-CLASSES = $(SOURCES:.cpp=.o)
-
-classes: $(CLASSES)
-
-%.o: %.cpp
- $(CC) $(CC_ARGS) -c -fPIC $(BUILD_ARGS) $(COMMONFLAGS) $(INCLUDES) $< -o $(OUT_DIR)/$*.o
-
-$(OUT_FILE): classes
- cd $(OUT_DIR) && $(CC) $(CC_ARGS) $(BUILD_ARGS) $(COMMONFLAGS) -shared -Wl,-no-undefined -o $(OUT_FILE) $(CLASSES) $(LINK_LIBS)
- ln -sf $(OUT_DIR)/$(OUT_FILE) .
-
-clean:
- rm -rf $(OUT_DIR)/*
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MATRIX_H_
-#define MATRIX_H_
-
-#include "matrix_funcs.h"
-#ifdef NUMPY_INTERFACE
-#include <Python.h>
-#include <arrayobject.h>
-#endif
-#include <limits>
-#include <assert.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <vector>
-
-extern "C" {
-// #include <cblas.h>
-#include "caffe2/utils/cblas.h"
-}
-
-#ifdef DOUBLE_PRECISION
-#define CBLAS_GEMM cblas_dgemm
-#define CBLAS_SCAL cblas_dscal
-#define CBLAS_AXPY cblas_daxpy
-#else
-#define CBLAS_GEMM cblas_sgemm
-#define CBLAS_SCAL cblas_sscal
-#define CBLAS_AXPY cblas_saxpy
-#endif /* DOUBLE_PRECISION */
-
-#define MTYPE_MAX numeric_limits<MTYPE>::max()
-
-typedef long long int int64;
-
-class Matrix {
-private:
- MTYPE* _data;
- bool _ownsData;
- int64 _numRows, _numCols;
- int64 _numElements;
- CBLAS_TRANSPOSE _trans;
-
- void _init(MTYPE* data, int64 numRows, int64 numCols, bool transpose, bool ownsData);
- void _tileTo2(Matrix& target) const;
- void _copyAllTo(Matrix& target) const;
- MTYPE _sum_column(int64 col) const;
- MTYPE _sum_row(int64 row) const;
- MTYPE _aggregate(MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
- void _aggregate(int64 axis, Matrix& target, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
- MTYPE _aggregateRow(int64 row, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
- MTYPE _aggregateCol(int64 row, MTYPE(*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const;
- void _updateDims(int64 numRows, int64 numCols);
- void _applyLoop(MTYPE(*func)(MTYPE));
- void _applyLoop(MTYPE (*func)(MTYPE), Matrix& target);
- void _applyLoop2(const Matrix& a, MTYPE(*func)(MTYPE, MTYPE), Matrix& target) const;
- void _applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE), MTYPE scalar, Matrix& target) const;
- void _applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE, MTYPE), MTYPE scalar1, MTYPE scalar2, Matrix& target) const;
- void _applyLoopScalar(const MTYPE scalar, MTYPE(*func)(MTYPE, MTYPE), Matrix& target) const;
- void _checkBounds(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const;
- void _divideByVector(const Matrix& vec, Matrix& target);
- inline int64 _getNumColsBackEnd() const {
- return _trans == CblasNoTrans ? _numCols : _numRows;
- }
-public:
- enum FUNCTION {
- TANH, RECIPROCAL, SQUARE, ABS, EXP, LOG, ZERO, ONE, LOGISTIC1, LOGISTIC2, SIGN
- };
- Matrix();
- Matrix(int64 numRows, int64 numCols);
- Matrix(int64 numRows, int64 numCols, bool transpose);
-#ifdef NUMPY_INTERFACE
- Matrix(const PyArrayObject *src);
-#endif
- Matrix(const Matrix &like);
- Matrix(MTYPE* data, int64 numRows, int64 numCols);
- Matrix(MTYPE* data, int64 numRows, int64 numCols, bool transpose);
- ~Matrix();
-
- inline MTYPE& getCell(int64 i, int64 j) const {
- assert(i >= 0 && i < _numRows);
- assert(j >= 0 && j < _numCols);
- if (_trans == CblasTrans) {
- return _data[j * _numRows + i];
- }
- return _data[i * _numCols + j];
- }
-
- MTYPE& operator()(int64 i, int64 j) const {
- return getCell(i, j);
- }
-
- inline MTYPE* getData() const {
- return _data;
- }
-
- inline bool isView() const {
- return !_ownsData;
- }
-
- inline int64 getNumRows() const {
- return _numRows;
- }
-
- inline int64 getNumCols() const {
- return _numCols;
- }
-
- inline int64 getNumDataBytes() const {
- return _numElements * sizeof(MTYPE);
- }
-
- inline int64 getNumElements() const {
- return _numElements;
- }
-
- inline int64 getLeadingDim() const {
- return _trans == CblasTrans ? _numRows : _numCols;
- }
-
- inline int64 getFollowingDim() const {
- return _trans == CblasTrans ? _numCols : _numRows;
- }
-
- inline CBLAS_TRANSPOSE getBLASTrans() const {
- return _trans;
- }
-
- inline bool isSameDims(const Matrix& a) const {
- return a.getNumRows() == getNumRows() && a.getNumCols() == getNumCols();
- }
-
- inline bool isTrans() const {
- return _trans == CblasTrans;
- }
-
- /*
- * Only use if you know what you're doing!
- * Does not update any dimensions. Just flips the _trans flag.
- *
- * Use transpose() if you want to get the transpose of this matrix.
- */
- inline void setTrans(bool trans) {
- assert(isTrans() == trans || !isView());
- _trans = trans ? CblasTrans : CblasNoTrans;
- }
-
- void apply(FUNCTION f);
- void apply(Matrix::FUNCTION f, Matrix& target);
- void subtractFromScalar(MTYPE scalar);
- void subtractFromScalar(MTYPE scalar, Matrix &target) const;
- void biggerThanScalar(MTYPE scalar);
- void smallerThanScalar(MTYPE scalar);
- void equalsScalar(MTYPE scalar);
- void biggerThanScalar(MTYPE scalar, Matrix& target) const;
- void smallerThanScalar(MTYPE scalar, Matrix& target) const;
- void equalsScalar(MTYPE scalar, Matrix& target) const;
- void biggerThan(Matrix& a);
- void biggerThan(Matrix& a, Matrix& target) const;
- void smallerThan(Matrix& a);
- void smallerThan(Matrix& a, Matrix& target) const;
- void minWith(Matrix &a);
- void minWith(Matrix &a, Matrix &target) const;
- void maxWith(Matrix &a);
- void maxWith(Matrix &a, Matrix &target) const;
- void equals(Matrix& a);
- void equals(Matrix& a, Matrix& target) const;
- void notEquals(Matrix& a) ;
- void notEquals(Matrix& a, Matrix& target) const;
- void add(const Matrix &m);
- void add(const Matrix &m, MTYPE scale);
- void add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM);
- void add(const Matrix &m, Matrix& target);
- void add(const Matrix &m, MTYPE scaleM, Matrix &target);
- void add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM, Matrix &target);
- void subtract(const Matrix &m);
- void subtract(const Matrix &m, Matrix& target);
- void subtract(const Matrix &m, MTYPE scale);
- void subtract(const Matrix &m, MTYPE scale, Matrix& target);
- void addVector(const Matrix& vec, MTYPE scale);
- void addVector(const Matrix& vec, MTYPE scale, Matrix& target);
- void addVector(const Matrix& vec);
- void addVector(const Matrix& vec, Matrix& target);
- void addScalar(MTYPE scalar);
- void addScalar(MTYPE scalar, Matrix& target) const;
- void maxWithScalar(MTYPE scalar);
- void maxWithScalar(MTYPE scalar, Matrix &target) const;
- void minWithScalar(MTYPE scalar);
- void minWithScalar(MTYPE scalar, Matrix &target) const;
- void eltWiseMultByVector(const Matrix& vec);
- void eltWiseMultByVector(const Matrix& vec, Matrix& target);
- void eltWiseDivideByVector(const Matrix& vec);
- void eltWiseDivideByVector(const Matrix& vec, Matrix& target);
- void resize(int64 newNumRows, int64 newNumCols);
- void resize(const Matrix& like);
- Matrix& slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const;
- void slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol, Matrix &target) const;
- Matrix& sliceRows(int64 startRow, int64 endRow) const;
- void sliceRows(int64 startRow, int64 endRow, Matrix& target) const;
- Matrix& sliceCols(int64 startCol, int64 endCol) const;
- void sliceCols(int64 startCol, int64 endCol, Matrix& target) const;
- void rightMult(const Matrix &b, MTYPE scale);
- void rightMult(const Matrix &b, Matrix &target) const;
- void rightMult(const Matrix &b);
- void rightMult(const Matrix &b, MTYPE scaleAB, Matrix &target) const;
- void addProduct(const Matrix &a, const Matrix &b, MTYPE scaleAB, MTYPE scaleThis);
- void addProduct(const Matrix& a, const Matrix& b);
- void eltWiseMult(const Matrix& a);
- void eltWiseMult(const Matrix& a, Matrix& target) const;
- void eltWiseDivide(const Matrix& a);
- void eltWiseDivide(const Matrix& a, Matrix &target) const;
- Matrix& transpose() const;
- Matrix& transpose(bool hard) const;
- Matrix& tile(int64 timesY, int64 timesX) const;
- void tile(int64 timesY, int64 timesX, Matrix& target) const;
- void copy(Matrix &dest, int64 srcStartRow, int64 srcEndRow, int64 srcStartCol, int64 srcEndCol, int64 destStartRow, int64 destStartCol) const;
- Matrix& copy() const;
- void copy(Matrix& target) const;
- Matrix& sum(int64 axis) const;
- void sum(int64 axis, Matrix &target) const;
- MTYPE sum() const;
- MTYPE max() const;
- Matrix& max(int64 axis) const;
- void max(int64 axis, Matrix& target) const;
- MTYPE min() const;
- Matrix& min(int64 axis) const;
- void min(int64 axis, Matrix& target) const;
- MTYPE norm() const;
- MTYPE norm2() const;
- void scale(MTYPE scale);
- void scale(MTYPE alpha, Matrix& target);
- void reshape(int64 numRows, int64 numCols);
- Matrix& reshaped(int64 numRows, int64 numCols);
- void printShape(const char* name) const;
- bool hasNan() const;
- bool hasInf() const;
-
- void randomizeNormal(MTYPE mean, MTYPE stdev);
- void randomizeUniform();
- void randomizeNormal();
- void print() const;
- void print(int64 startRow,int64 rows, int64 startCol,int64 cols) const;
- void print(int64 rows, int64 cols) const;
-};
-
-typedef std::vector<Matrix*> MatrixV;
-
-#endif /* MATRIX_H_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MATRIX_FUNCS_H_
-#define MATRIX_FUNCS_H_
-
-#include <stdlib.h>
-#include <math.h>
-#include <algorithm>
-
-#ifdef DOUBLE_PRECISION
-#define MTYPE double
-#else
-#define MTYPE float
-#endif
-
-#define MYRAND ((double)rand() / ((double)RAND_MAX + 1))
-
// Scalar kernels used by Matrix's elementwise loops. Each is a plain
// function (not a lambda/functor) so it can be passed around as a
// MTYPE (*)(MTYPE) or MTYPE (*)(MTYPE, MTYPE) function pointer.
// Floating-point expressions are left exactly as written to preserve
// bit-identical results.

// Constant 0, regardless of input.
inline MTYPE _zero(MTYPE /*x*/) {
    return 0;
}

// Constant 1, regardless of input.
inline MTYPE _one(MTYPE /*x*/) {
    return 1;
}

// |x|.
inline MTYPE _abs(MTYPE x) {
    return x > 0 ? x : -x;
}

// x^2.
inline MTYPE _square(MTYPE x) {
    return x * x;
}

// Logistic sigmoid via tanh: (tanh(x/2)+1)/2 == 1/(1+e^-x).
inline MTYPE _sigma1(MTYPE x) {
    return (tanh(x / 2) + 1) / 2;
}

// Logistic sigmoid via exp.
inline MTYPE _sigma2(MTYPE x) {
    return 1 / (1 + exp(-x));
}

// 1/x (x == 0 yields IEEE inf, not a trap).
inline MTYPE _recip(MTYPE x) {
    return 1 / x;
}

inline MTYPE _exp(MTYPE x) {
    return exp(x);
}

inline MTYPE _log(MTYPE x) {
    return log(x);
}

inline MTYPE _tanh(MTYPE x) {
    return tanh(x);
}

// Sign of x. Note: returns -1 for x == 0 (not 0).
inline MTYPE _sign(MTYPE x) {
    return x > 0 ? 1 : -1;
}

// Uniform random in [0, 1); ignores input. Uses rand() via MYRAND, so it is
// neither thread-safe nor reproducible across platforms.
inline MTYPE _rand(MTYPE /*x*/) {
    return MYRAND;
}

// Binary kernels: the Matrix loops pass this(i,j) as x and the other
// operand (or accumulator) as y.

inline MTYPE _divide(MTYPE x, MTYPE y) {
    return x / y;
}

inline MTYPE _mult(MTYPE x, MTYPE y) {
    return x * y;
}

inline MTYPE _add(MTYPE x, MTYPE y) {
    return x + y;
}

// x^2 + y; with y as a running accumulator this computes a sum of squares.
inline MTYPE _addSquare(MTYPE x, MTYPE y) {
    return x*x + y;
}

// x + scale*y.
inline MTYPE _addWithScale(MTYPE x, MTYPE y, MTYPE scale) {
    return x + scale*y;
}

// scaleThis*x + scaleM*y.
inline MTYPE _addWithScale2(MTYPE x, MTYPE y, MTYPE scaleThis, MTYPE scaleM) {
    return scaleThis * x + scaleM * y;
}

inline MTYPE _max(MTYPE x, MTYPE y) {
    return std::max(x, y);
}

inline MTYPE _min(MTYPE x, MTYPE y) {
    return std::min(x, y);
}

// The comparison kernels return 0/1 as MTYPE (mask values).
inline MTYPE _bigger(MTYPE x, MTYPE y) {
    return x > y;
}

inline MTYPE _smaller(MTYPE x, MTYPE y) {
    return x < y;
}

// Exact floating-point equality — intentional for mask construction.
inline MTYPE _equal(MTYPE x, MTYPE y) {
    return x == y;
}

inline MTYPE _notEqual(MTYPE x, MTYPE y) {
    return x != y;
}
-
-#endif /* MATRIX_FUNCS_H_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef QUEUE_H_
-#define QUEUE_H_
-#include <pthread.h>
-#include <stdlib.h>
-
-/*
- * A thread-safe circular queue that automatically grows but never shrinks.
- */
/*
 * A thread-safe circular queue that automatically grows but never shrinks.
 *
 * Fixes over the original:
 *  - expand() used memcpy on the element array, which is undefined behavior
 *    for non-trivially-copyable T (and <string.h> was never included by this
 *    header); elements are now copied one by one.
 *  - The mutex and condition variable are direct members instead of
 *    unchecked malloc'd blocks.
 *
 * NOTE(review): copying a Queue would share pthread state and double-free
 * _elements; callers should not copy instances.
 */
template <class T>
class Queue {
private:
    T *_elements;        // circular storage; live range is [_head, _head + _numElements) mod _maxSize
    int _numElements;    // current number of queued elements
    int _head, _tail;    // dequeue position / next enqueue position
    int _maxSize;        // capacity of _elements
    pthread_mutex_t _queueMutex;
    pthread_cond_t _queueCV;

    // Shared by both constructors: allocate storage and init sync primitives.
    void _init(int initialSize) {
        _numElements = 0;
        _head = 0;
        _tail = 0;
        _maxSize = initialSize;
        _elements = new T[initialSize];
        pthread_mutex_init(&_queueMutex, NULL);
        pthread_cond_init(&_queueCV, NULL);
    }

    // Doubles capacity. Caller must hold _queueMutex; only invoked when full.
    void expand() {
        T *newStorage = new T[_maxSize * 2];
        // Unroll the circular buffer into the front of the new storage.
        // Element-wise assignment (not memcpy) so non-POD T stays valid.
        for (int i = 0; i < _numElements; i++) {
            newStorage[i] = _elements[(_head + i) % _maxSize];
        }
        delete[] _elements;
        _elements = newStorage;
        _head = 0;
        _tail = _numElements;
        _maxSize *= 2;
    }
public:
    Queue(int initialSize) {
        _init(initialSize);
    }

    Queue() {
        _init(1);
    }

    ~Queue() {
        pthread_mutex_destroy(&_queueMutex);
        pthread_cond_destroy(&_queueCV);
        delete[] _elements;
    }

    // Appends el (growing the buffer if full) and wakes one waiting consumer.
    void enqueue(T el) {
        pthread_mutex_lock(&_queueMutex);
        if (_numElements == _maxSize) {
            expand();
        }
        _elements[_tail] = el;
        _tail = (_tail + 1) % _maxSize;
        _numElements++;

        pthread_cond_signal(&_queueCV);
        pthread_mutex_unlock(&_queueMutex);
    }

    /*
     * Blocks until not empty.
     */
    T dequeue() {
        pthread_mutex_lock(&_queueMutex);
        // pthread_cond_wait may return spuriously (and a signal can unblock
        // more than one waiter), so the predicate is re-checked in a loop.
        while (_numElements == 0) {
            pthread_cond_wait(&_queueCV, &_queueMutex);
        }
        T el = _elements[_head];
        _head = (_head + 1) % _maxSize;
        _numElements--;
        pthread_mutex_unlock(&_queueMutex);
        return el;
    }

    /*
     * Obviously this number can change by the time you actually look at it.
     */
    inline int getNumElements() const {
        return _numElements;
    }
};
-
-#endif /* QUEUE_H_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef SYNC_H_
-#define SYNC_H_
-
-#include <pthread.h>
-
// Minimal mutual-exclusion wrapper over a pthread mutex with explicit
// acquire()/release() calls (no RAII guard; callers pair the calls
// themselves).
class Lock {
private:
    pthread_mutex_t _mutex;   // default (non-recursive) attributes
public:
    Lock() { pthread_mutex_init(&_mutex, NULL); }

    ~Lock() { pthread_mutex_destroy(&_mutex); }

    // Blocks until the mutex is held by the calling thread.
    void acquire() { pthread_mutex_lock(&_mutex); }

    // Releases the mutex; must be called by the holding thread.
    void release() { pthread_mutex_unlock(&_mutex); }
};
-
/*
 * Reusable barrier: each call to sync() blocks until numThreads threads have
 * arrived, then all are released and the barrier resets for the next round.
 *
 * Fixes over the original: the mutex and condition variable are plain
 * members rather than unchecked malloc'd blocks — the header also never
 * included <stdlib.h>, so the malloc/free calls relied on transitive
 * includes.
 */
class ThreadSynchronizer {
private:
    int _numThreads;   // participants required per round
    int _numSynced;    // participants arrived in the current round
    pthread_mutex_t _syncMutex;
    pthread_cond_t _syncThresholdCV;
public:
    ThreadSynchronizer(int numThreads) {
        _numThreads = numThreads;
        _numSynced = 0;
        pthread_mutex_init(&_syncMutex, NULL);
        pthread_cond_init(&_syncThresholdCV, NULL);
    }

    ~ThreadSynchronizer() {
        pthread_mutex_destroy(&_syncMutex);
        pthread_cond_destroy(&_syncThresholdCV);
    }

    // Blocks until all _numThreads threads have called sync(); the last
    // arrival resets the count and wakes everyone.
    void sync() {
        pthread_mutex_lock(&_syncMutex);
        _numSynced++;

        if (_numSynced == _numThreads) {
            _numSynced = 0;
            pthread_cond_broadcast(&_syncThresholdCV);
        } else {
            // NOTE(review): no spurious-wakeup guard here — a predicate loop
            // would need a generation counter. Original semantics preserved.
            pthread_cond_wait(&_syncThresholdCV, &_syncMutex);
        }
        pthread_mutex_unlock(&_syncMutex);
    }
};
-
-#endif /* SYNC_H_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef THREAD_H_
-#define THREAD_H_
-#include <pthread.h>
-#include <stdio.h>
-#include <errno.h>
-#include <assert.h>
-#include <vector>
-
#define NUM_CPUS_MAX 48

/*
 * Abstract joinable thread class (Linux/pthreads; CPU affinity uses the
 * glibc CPU_ALLOC/CPU_SET_S macros).
 * The only thing the implementer has to fill in is the run method.
 *
 * Fixes over the original: setAffinity() no longer leaks a previously
 * allocated cpu_set_t when called more than once, and the CPU loop index is
 * size_t (the original compared a signed int against cpus.size()).
 */
class Thread {
private:
    cpu_set_t *_cpu_set;          // optional affinity mask, CPU_ALLOC'd; NULL if unset
    pthread_attr_t _pthread_attr;
    pthread_t _threadID;
    bool _joinable, _startable;   // _startable goes false once start() is called

    // pthread entry trampoline: forwards to the virtual run().
    static void* start_pthread_func(void *obj) {
        void* retval = reinterpret_cast<Thread*>(obj)->run();
        pthread_exit(retval);
        return retval;  // not reached; keeps the compiler happy
    }
protected:
    // Thread body supplied by the subclass; its return value is the thread's
    // exit status (retrievable through join(void**)).
    virtual void* run() = 0;
public:
    Thread(bool joinable) : _cpu_set(NULL), _joinable(joinable), _startable(true) {
        pthread_attr_init(&_pthread_attr);
    }

    Thread(bool joinable, std::vector<int>& cpus) : _cpu_set(NULL), _joinable(joinable), _startable(true) {
        pthread_attr_init(&_pthread_attr);
        setAffinity(cpus);
    }

    virtual ~Thread() {
        if (_cpu_set != NULL) {
            CPU_FREE(_cpu_set);
        }
        pthread_attr_destroy(&_pthread_attr);
    }

    // Pins the (not yet started) thread to the given CPU ids. An empty list
    // or a leading negative id leaves the attribute untouched (no-op mask).
    void setAffinity(std::vector<int>& cpus) {
        assert(_startable);
        if (_cpu_set != NULL) {
            CPU_FREE(_cpu_set);   // fix: don't leak the mask if called twice
        }
        _cpu_set = CPU_ALLOC(NUM_CPUS_MAX);
        size_t size = CPU_ALLOC_SIZE(NUM_CPUS_MAX);
        if (cpus.size() > 0 && cpus[0] >= 0) {
            CPU_ZERO_S(size, _cpu_set);
            for (size_t i = 0; i < cpus.size(); i++) {  // fix: size_t index
                assert(cpus[i] < NUM_CPUS_MAX);
                CPU_SET_S(cpus[i], size, _cpu_set);
            }
            pthread_attr_setaffinity_np(&_pthread_attr, size, _cpu_set);
        }
    }

    // Launches the thread; may be called at most once per object. On
    // pthread_create failure, prints the error and returns an indeterminate
    // id (original behavior preserved).
    pthread_t start() {
        assert(_startable);
        _startable = false;
        pthread_attr_setdetachstate(&_pthread_attr, _joinable ? PTHREAD_CREATE_JOINABLE : PTHREAD_CREATE_DETACHED);
        int n;
        if ((n = pthread_create(&_threadID, &_pthread_attr, &Thread::start_pthread_func, (void*)this))) {
            errno = n;
            perror("pthread_create error");
        }
        return _threadID;
    }

    // Waits for the thread; only valid when constructed joinable. status, if
    // non-NULL, receives run()'s return value.
    void join(void **status) {
        assert(_joinable);
        int n;
        if((n = pthread_join(_threadID, status))) {
            errno = n;
            perror("pthread_join error");
        }
    }

    void join() {
        join(NULL);
    }

    pthread_t getThreadID() const {
        return _threadID;
    }

    bool isStartable() const {
        return _startable;
    }
};
-
-#endif /* THREAD_H_ */
+++ /dev/null
-/*
- * Copyright 2014 Google Inc. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../include/matrix.h"
-#include "../include/matrix_funcs.h"
-
#if defined(_WIN64) || defined(_WIN32)
// Windows-only shims: forward int arguments to the double overloads of
// sqrt/log. NOTE(review): presumably works around an MSVC toolchain lacking
// integral overloads — confirm these are still needed and don't clash with
// <cmath>'s overload set on current MSVC.
double sqrt(int _X) {return sqrt((double) _X);}
double log(int _X) {return log((double) _X);}
#endif
-
-using namespace std;
-
// ---- Construction / destruction --------------------------------------------

// Shared constructor body: records dimensions, data ownership, and logical
// transposedness (stored as a CBLAS transpose flag), and adopts data as-is.
void Matrix::_init(MTYPE* data, int64 numRows, int64 numCols, bool transpose, bool ownsData) {
    _updateDims(numRows, numCols);
    _ownsData = ownsData;
    _trans = transpose ? CblasTrans : CblasNoTrans;
    _data = data;
}

// Empty 0x0 matrix with no storage.
Matrix::Matrix() {
    _init(NULL, 0, 0, false, true);
}

// numRows x numCols matrix with uninitialized, owned storage.
Matrix::Matrix(int64 numRows, int64 numCols) {
    _init(NULL, numRows, numCols, false, true);
    this->_data = numRows * numCols > 0 ? new MTYPE[this->_numElements] : NULL;
}

// As above, but optionally flagged transposed from the start.
Matrix::Matrix(int64 numRows, int64 numCols, bool transpose) {
    _init(NULL, numRows, numCols, transpose, true);
    this->_data = numRows * numCols > 0 ? new MTYPE[this->_numElements] : NULL;
}

// Same shape as `like`, uninitialized contents, always non-transposed.
// Note: does NOT copy like's data — see copy().
Matrix::Matrix(const Matrix &like) {
    _init(NULL, like.getNumRows(), like.getNumCols(), false, true);
    this->_data = new MTYPE[this->_numElements];
}

/* construct a matrix with another matrix's data. the resultant
 * matrix does NOT own its data */
Matrix::Matrix(MTYPE* data, int64 numRows, int64 numCols) {
    _init(data, numRows, numCols, false, false);
}

/* construct a matrix with another matrix's data (and optionally transpose it). the resultant
 * matrix does NOT own its data -- it is a VIEW */
Matrix::Matrix(MTYPE* data, int64 numRows, int64 numCols, bool transpose) {
    _init(data, numRows, numCols, transpose, false);
}

#ifdef NUMPY_INTERFACE
// Wraps (or copies) a numpy array. C- or Fortran-contiguous arrays are
// wrapped in place without copying (Fortran order recorded as transposed);
// anything else is copied element by element into owned storage.
// NOTE(review): assumes src is 2-D with a dtype matching MTYPE — neither is
// checked here; confirm at the call sites.
Matrix::Matrix(const PyArrayObject *src) {
    this->_data = NULL;
    this->_trans = CblasNoTrans;
    if (src != NULL) {
        this->_updateDims(PyArray_DIM(src,0), PyArray_DIM(src,1));
        if (src->flags & NPY_CONTIGUOUS || src->flags & NPY_FORTRAN) {
            this->_data = (MTYPE*) src->data;
            this->_ownsData = false;
            this->_trans = src->flags & NPY_CONTIGUOUS ? CblasNoTrans : CblasTrans;
        } else {
            this->_data = new MTYPE[PyArray_DIM(src,0) * PyArray_DIM(src,1)];
            for (int64 i = 0; i < PyArray_DIM(src,0); i++) {
                for (int64 j = 0; j < PyArray_DIM(src,1); j++) {
                    (*this)(i,j) = *reinterpret_cast<MTYPE*>(PyArray_GETPTR2(src,i,j));
                }
            }
            this->_ownsData = true;
        }
    }
}
#endif
// Frees storage only when owned; views share their parent's buffer.
Matrix::~Matrix() {
    if(this->_data != NULL && this->_ownsData) {
        delete[] this->_data;
    }
}

// Refreshes the cached row/col/element counts; does not touch _data.
void Matrix::_updateDims(int64 numRows, int64 numCols) {
    this->_numRows = numRows;
    this->_numCols = numCols;
    this->_numElements = numRows * numCols;
}
-
// Asserts that the [startRow,endRow) x [startCol,endCol) window lies inside
// this matrix (end indices may equal the dimension).
void Matrix::_checkBounds(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const {
    assert(startRow >= 0 && startRow <= _numRows);
    assert(endRow >= 0 && endRow <= _numRows);
    assert(startCol >= 0 && startCol <= _numCols);
    assert(endCol >= 0 && endCol <= _numCols);
}

/* will return a view if possible */
// Negative end indices mean "through the end". Returns a heap-allocated
// matrix the CALLER must delete: a zero-copy VIEW when the window is
// contiguous in storage (full width in the majority direction, or a single
// row/column), otherwise an owning copy of the window.
Matrix& Matrix::slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol) const {
    endRow = endRow < 0 ? this->_numRows : endRow;
    endCol = endCol < 0 ? this->_numCols : endCol;
    _checkBounds(startRow, endRow, startCol, endCol);
    if (!isTrans() && ((startCol == 0 && endCol == this->_numCols) || (startRow == endRow - 1))) {
        return *new Matrix(this->_data + startRow * this->_numCols + startCol, endRow - startRow, endCol - startCol);
    } else if (isTrans() && ((startRow == 0 && endRow == this->_numRows) || (startCol == endCol - 1))) {
        return *new Matrix(this->_data + startCol * this->_numRows + startRow, endRow - startRow, endCol - startCol, true);
    }
    Matrix& newSlice = *new Matrix(endRow - startRow, endCol - startCol);
    this->copy(newSlice, startRow, endRow, startCol, endCol, 0, 0);
    return newSlice;
}

/* this will NEVER return a view, unlike Matrix_slice */
// Copies the window into target, resizing it first.
void Matrix::slice(int64 startRow, int64 endRow, int64 startCol, int64 endCol, Matrix& target) const {
    endRow = endRow < 0 ? this->_numRows : endRow;
    endCol = endCol < 0 ? this->_numCols : endCol;
    _checkBounds(startRow, endRow, startCol, endCol);
    target.resize(endRow - startRow, endCol - startCol);
    this->copy(target, startRow, endRow, startCol, endCol, 0, 0);
}

// Row/column convenience wrappers over slice(); same view-vs-copy and
// ownership rules as above.
Matrix& Matrix::sliceRows(int64 startRow, int64 endRow) const {
    return slice(startRow, endRow, 0, -1);
}

void Matrix::sliceRows(int64 startRow, int64 endRow, Matrix& target) const {
    slice(startRow, endRow, 0, -1, target);
}

Matrix& Matrix::sliceCols(int64 startCol, int64 endCol) const {
    return slice(0, -1, startCol, endCol);
}

void Matrix::sliceCols(int64 startCol, int64 endCol, Matrix& target) const {
    slice(0, -1, startCol, endCol, target);
}
-
// ---- Scalar arithmetic and matrix addition ---------------------------------

// this := scalar - this (in place).
void Matrix::subtractFromScalar(MTYPE scalar) {
    subtractFromScalar(scalar, *this);
}

// target := scalar - this, computed as (-this) + scalar.
void Matrix::subtractFromScalar(MTYPE scalar, Matrix& target) const {
    if(&target != this) {
        copy(target);
    }
    target.scale(-1);
    target.addScalar(scalar);
}

// In-place comparisons against a scalar; results are 0/1 masks.
void Matrix::biggerThanScalar(MTYPE scalar) {
    biggerThanScalar(scalar, *this);
}

void Matrix::smallerThanScalar(MTYPE scalar) {
    smallerThanScalar(scalar, *this);
}

void Matrix::equalsScalar(MTYPE scalar) {
    equalsScalar(scalar, *this);
}

// target(i,j) := this(i,j) > scalar ? 1 : 0. Resizes target.
void Matrix::biggerThanScalar(MTYPE scalar, Matrix& target) const {
    target.resize(*this);
    _applyLoopScalar(scalar, &_bigger, target);
}

// target(i,j) := this(i,j) < scalar ? 1 : 0. Resizes target.
void Matrix::smallerThanScalar(MTYPE scalar, Matrix& target) const {
    target.resize(*this);
    _applyLoopScalar(scalar, &_smaller, target);
}

// target(i,j) := this(i,j) == scalar ? 1 : 0 (exact FP compare). Resizes target.
void Matrix::equalsScalar(MTYPE scalar, Matrix& target) const {
    target.resize(*this);
    _applyLoopScalar(scalar, &_equal, target);
}

// The add/subtract overloads below all funnel into
// add(m, scaleThis, scaleM, target); subtraction is addition with the scale
// negated.
void Matrix::add(const Matrix &m) {
    add(m, 1, *this);
}

void Matrix::add(const Matrix &m, Matrix& target) {
    add(m, 1, target);
}

void Matrix::add(const Matrix &m, MTYPE scale) {
    add(m, scale, *this);
}

void Matrix::subtract(const Matrix &m) {
    add(m, -1, *this);
}

void Matrix::subtract(const Matrix &m, Matrix& target) {
    add(m, -1, target);
}

void Matrix::subtract(const Matrix &m, MTYPE scale) {
    add(m, -scale, *this);
}

void Matrix::subtract(const Matrix &m, MTYPE scale, Matrix& target) {
    add(m, -scale, target);
}

void Matrix::add(const Matrix &m, MTYPE scaleM, Matrix &target) {
    add(m, 1, scaleM, target);
}

void Matrix::add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM) {
    add(m, scaleThis, scaleM, *this);
}

// target := scaleThis * this + scaleM * m. Shapes must already match.
// Takes a single BLAS axpy fast path only when all three matrices share the
// same layout flag and scaleThis == 1; otherwise falls back to the
// element-by-element loops (which handle mismatched layouts).
void Matrix::add(const Matrix &m, MTYPE scaleThis, MTYPE scaleM, Matrix &target) {
    assert(this->isSameDims(m));
    if (isTrans() != m.isTrans() || isTrans() != target.isTrans() || scaleThis != 1) {
        if (&target != this) {
            target.resize(*this);
        }
        if(scaleThis == 1 && scaleM == 1) {
            this->_applyLoop2(m, &_add, target);
        } else if (scaleThis == 1) {
            this->_applyLoop2(m, &_addWithScale, scaleM, target);
        } else {
            this->_applyLoop2(m, &_addWithScale2, scaleThis, scaleM, target);
        }
    } else {
        if (&target != this) {
            copy(target);
        }
        CBLAS_AXPY(getNumElements(), scaleM, m._data, 1, target._data, 1);
    }
}

// this += scalar (in place).
void Matrix::addScalar(MTYPE scalar) {
    addScalar(scalar, *this);
}

// target := this + scalar. Resizes target.
void Matrix::addScalar(MTYPE scalar, Matrix& target) const {
    target.resize(*this);
    _applyLoopScalar(scalar, &_add, target);
}

// this := max(this, scalar) elementwise.
void Matrix::maxWithScalar(MTYPE scalar) {
    maxWithScalar(scalar, *this);
}

void Matrix::maxWithScalar(MTYPE scalar, Matrix& target) const {
    target.resize(*this);
    _applyLoopScalar(scalar, &_max, target);
}

// this := min(this, scalar) elementwise.
void Matrix::minWithScalar(MTYPE scalar) {
    minWithScalar(scalar, *this);
}

void Matrix::minWithScalar(MTYPE scalar, Matrix& target) const {
    target.resize(*this);
    _applyLoopScalar(scalar, &_min, target);
}
-
// ---- Elementwise comparisons / min / max against another matrix ------------
// Each pair: the one-argument form operates in place; the two-argument form
// resizes target and writes there. Comparison results are 0/1 masks; shapes
// must match (asserted).

void Matrix::biggerThan(Matrix& a) {
    biggerThan(a, *this);
}

// target(i,j) := this(i,j) > a(i,j) ? 1 : 0.
void Matrix::biggerThan(Matrix& a, Matrix& target) const {
    assert(isSameDims(a));
    target.resize(*this);
    _applyLoop2(a, &_bigger, target);
}

void Matrix::smallerThan(Matrix& a) {
    smallerThan(a, *this);
}

// target(i,j) := this(i,j) < a(i,j) ? 1 : 0.
void Matrix::smallerThan(Matrix& a, Matrix& target) const {
    assert(isSameDims(a));
    target.resize(*this);
    _applyLoop2(a, &_smaller, target);
}

void Matrix::equals(Matrix& a) {
    equals(a, *this);
}

// target(i,j) := this(i,j) == a(i,j) ? 1 : 0 (exact FP compare).
void Matrix::equals(Matrix& a, Matrix& target) const {
    assert(isSameDims(a));
    target.resize(*this);
    _applyLoop2(a, &_equal, target);
}

void Matrix::notEquals(Matrix& a) {
    notEquals(a, *this);
}

void Matrix::notEquals(Matrix& a, Matrix& target) const {
    assert(isSameDims(a));
    target.resize(*this);
    _applyLoop2(a, &_notEqual, target);
}

void Matrix::minWith(Matrix &a) {
    minWith(a, *this);
}

// target := elementwise min(this, a).
void Matrix::minWith(Matrix &a, Matrix& target) const {
    assert(isSameDims(a));
    target.resize(*this);
    _applyLoop2(a, &_min, target);
}

void Matrix::maxWith(Matrix &a) {
    maxWith(a, *this);
}

// target := elementwise max(this, a).
void Matrix::maxWith(Matrix &a, Matrix& target) const {
    assert(isSameDims(a));
    target.resize(*this);
    _applyLoop2(a, &_max, target);
}
-
/* this := this + scale*tile(vec) */
// Adds a row or column vector to every row/column of target using one
// strided BLAS axpy per row (row vector) or per column (column vector).
// A 1x1 vec degenerates to addScalar. vec's length must match the matching
// target dimension (asserted).
void Matrix::addVector(const Matrix& vec, MTYPE scale, Matrix& target) {
    if(&target != this) {
        copy(target);
    }
    assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1);
    const bool rowVector = vec.getNumRows() == 1;
    const bool colVector = vec.getNumCols() == 1;
    assert((rowVector && vec.getNumCols() == target.getNumCols()) || (colVector && vec.getNumRows() == target.getNumRows()));
    if (rowVector && colVector) {
        addScalar(vec(0,0) * scale, target);
        return;
    }
    // dataInc: offset between successive axpy start positions; myStride: step
    // between elements within one axpy. Both flip depending on whether
    // target is stored transposed, so the axpy always walks the right slice.
    const int64 loopTil = rowVector ? target.getNumRows() : target.getNumCols();
    const int64 dataInc = ((rowVector && target.isTrans()) || (!rowVector && !target.isTrans())) ? 1 : (rowVector ? target.getNumCols() : target.getNumRows());
    const int64 myStride = ((target.isTrans() && rowVector) || (!target.isTrans() && !rowVector)) ? loopTil : 1;
    for (int64 i = 0; i < loopTil; i++) {
        CBLAS_AXPY(vec.getNumElements(), scale, vec._data, 1, target._data + dataInc * i, myStride);
    }
}

/* this := this + scale*tile(vec) */
void Matrix::addVector(const Matrix& vec, MTYPE scale) {
    addVector(vec, scale, *this);
}

void Matrix::addVector(const Matrix& vec) {
    addVector(vec, 1, *this);
}

void Matrix::addVector(const Matrix& vec, Matrix& target) {
    addVector(vec, 1, target);
}

void Matrix::eltWiseMultByVector(const Matrix& vec) {
    eltWiseMultByVector(vec, *this);
}

/* omg test these */
// target := target .* tile(vec): scales each column (row vector) or each row
// (column vector) of target by the corresponding vec entry, via one strided
// CBLAS_SCAL call per vector element. Same stride juggling as addVector.
void Matrix::eltWiseMultByVector(const Matrix& vec, Matrix& target) {
    if(&target != this) {
        copy(target);
    }
    assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1);
    const bool rowVector = vec.getNumRows() == 1;
    assert((rowVector && vec.getNumCols() == target.getNumCols()) || (!rowVector && vec.getNumRows() == target.getNumRows()));
    const int64 dataInc = ((rowVector && !target.isTrans()) || (!rowVector && target.isTrans())) ? 1 : (rowVector ? target.getNumRows() : target.getNumCols());
    const int64 myStride = ((!target.isTrans() && !rowVector) || (target.isTrans() && rowVector)) ? 1 : vec.getNumElements();
    const int64 numScaling = rowVector ? target.getNumRows() : target.getNumCols();
    for (int64 i = 0; i < vec.getNumElements(); i++) {
        CBLAS_SCAL(numScaling, vec._data[i], target._data + dataInc * i, myStride);
    }
}
-
/* return := scale * this * b */
// In-place product: reuses *this as the target WITHOUT resizing, so it is
// only shape-correct when this*b has the same shape as *this (i.e. b is
// square). NOTE(review): confirm callers honor that.
void Matrix::rightMult(const Matrix& b, MTYPE scale) {
    rightMult(b, scale, *this);
}

/* return := this * b */
void Matrix::rightMult(const Matrix& b) {
    rightMult(b, 1);
}

/* target := this * b
 * also resizes target if necessary.*/
void Matrix::rightMult(const Matrix &b, Matrix &target) const {
    rightMult(b, 1, target);
}

/* target := scaleAB * this * b
 * also resizes target if necessary.*/
void Matrix::rightMult(const Matrix &b, MTYPE scaleAB, Matrix &target) const {
    if(&target != this) {
        target.resize(this->_numRows, b._numCols);
    }
    target.addProduct(*this, b, scaleAB, 0);
}

/* this := scaleAB * a*b + scaleC * this
 * ALL SIZES MUST BE CORRECT. */
// Thin wrapper over CBLAS_GEMM. *this must be non-transposed because it is
// handed to GEMM as the row-major output buffer; a's and b's transposedness
// is forwarded via their _trans flags.
void Matrix::addProduct(const Matrix& a, const Matrix& b, MTYPE scaleAB, MTYPE scaleThis) {
    assert(a.getNumCols() == b.getNumRows());
    assert(this->getNumRows() == a.getNumRows() && this->getNumCols() == b.getNumCols());
    assert(!isTrans());
    CBLAS_GEMM(CblasRowMajor, a._trans, b._trans, a._numRows, b._numCols, a._numCols, scaleAB, a._data,
            a._getNumColsBackEnd(), b._data, b._getNumColsBackEnd(), scaleThis, this->_data, this->_numCols);
}

// this := this + a*b.
void Matrix::addProduct(const Matrix& a, const Matrix& b) {
    addProduct(a, b, 1, 1);
}
-
// Soft transpose: returns a heap-allocated VIEW sharing this->_data with
// flipped dims and flipped transposedness. Caller deletes the view (the
// underlying data stays owned by this matrix).
Matrix& Matrix::transpose() const {
    return *new Matrix(this->_data, this->_numCols, this->_numRows, !isTrans());
}

// hard == true requests a physically rearranged, owning copy — but only when
// this matrix is not already flagged transposed; otherwise (or when
// hard == false) the soft-transpose view is returned.
Matrix& Matrix::transpose(bool hard) const {
    if (!hard || isTrans()) {
        return transpose();
    }
    Matrix &meTrans = *new Matrix(_numCols, _numRows);
    for (int64 i = 0; i < _numRows; i++) {
        for (int64 j = 0; j < _numCols; j++) {
            meTrans(j, i) = (*this)(i, j);
        }
    }
    return meTrans;
}

// Returns a heap-allocated timesY x timesX tiling of this matrix; caller
// must delete it.
Matrix& Matrix::tile(int64 timesY, int64 timesX) const {
    Matrix& tiled = *new Matrix(this->_numRows * timesY, this->_numCols * timesX);
    _tileTo2(tiled);
    return tiled;
}

/* resizes target if necessary */
void Matrix::tile(int64 timesY, int64 timesX, Matrix& target) const {
    target.resize(this->_numRows * timesY, this->_numCols * timesX);
    _tileTo2(target);
}

/* a variant ... seems to be no faster than original. */
// Stamps a full copy of this matrix into every (numRows x numCols)-aligned
// position of target.
void Matrix::_tileTo2(Matrix& target) const {
    for(int64 y = 0; y < target._numRows; y += this->_numRows) {
        for(int64 x = 0; x < target._numCols; x += this->_numCols) {
            this->copy(target, 0, -1, 0, -1, y, x);
        }
    }
}
-
/* guarantees that result will be non-transposed */
// Reallocates only when the element count changes; views may not be resized
// (asserted). NOTE(review): when the element count is unchanged but dims
// differ, the existing buffer is kept and merely reinterpreted with the new
// dims and NoTrans — contents are not rearranged.
void Matrix::resize(int64 newNumRows, int64 newNumCols) {
    if(this->_numRows != newNumRows || this->_numCols != newNumCols) {
        assert(!isView());
        if (this->getNumElements() != newNumRows * newNumCols) {
            delete[] this->_data; //deleting NULL is ok, sez c++
            this->_data = new MTYPE[newNumRows * newNumCols];
        }
        this->_updateDims(newNumRows, newNumCols);
        this->_trans = CblasNoTrans;
    }
}

// Resize to another matrix's shape.
void Matrix::resize(const Matrix& like) {
    resize(like.getNumRows(), like.getNumCols());
}

// this *= alpha (in place).
void Matrix::scale(MTYPE alpha) {
    scale(alpha, *this);
}

// target := alpha * this (resizes and copies first when target != this).
void Matrix::scale(MTYPE alpha, Matrix& target) {
    if (&target != this) {
        target.resize(*this);
        copy(target);
    }
    CBLAS_SCAL(getNumElements(), alpha, target._data, 1);
}
-
/* performs no resizing.
 * Warnings:
 * 1. ALL DIMENSIONS MUST BE CORRECT
 * 2. The source and destination memories better not overlap! */
// Copies the [srcStartRow,srcEndRow) x [srcStartCol,srcEndCol) window of
// this into dest at (destStartRow, destStartCol); negative src end indices
// mean "to the end". Row-wise memcpy when both matrices are row-major;
// otherwise an element loop that respects either layout.
void Matrix::copy(Matrix& dest, int64 srcStartRow, int64 srcEndRow, int64 srcStartCol, int64 srcEndCol, int64 destStartRow, int64 destStartCol) const {
    srcEndRow = srcEndRow < 0 ? this->_numRows : srcEndRow;
    srcEndCol = srcEndCol < 0 ? this->_numCols : srcEndCol;
    assert(destStartRow >= 0 && destStartCol >= 0); //some range-checking
    assert(srcEndRow <= _numRows && srcEndCol <= _numCols);
    assert(destStartRow + srcEndRow - srcStartRow <= dest.getNumRows());
    assert(destStartCol + srcEndCol - srcStartCol <= dest.getNumCols());
    // I found no evidence that memcpy is actually faster than just
    // copying element-by-element.
    if (!isTrans() && !dest.isTrans()) {
        int64 src_start_idx = this->_numCols * srcStartRow + srcStartCol;
        int64 dest_start_idx = dest._numCols * destStartRow + destStartCol;
        int64 copy_row_width = srcEndCol - srcStartCol;

        for (int64 i = srcStartRow; i < srcEndRow; i++) {
            memcpy(dest._data + dest_start_idx + dest._numCols * (i - srcStartRow),
                    this->_data + src_start_idx + this->_numCols * (i - srcStartRow), sizeof(MTYPE) * copy_row_width);
        }
    } else {
        for (int64 i = srcStartRow; i < srcEndRow; i++) {
            for (int64 j = srcStartCol; j < srcEndCol; j++) {
                dest(i - srcStartRow + destStartRow, j - srcStartCol + destStartCol) = (*this)(i, j);
            }
        }
    }
}

/* preserves everything excluding transposedness.
 * new matrix owns its data */
// Caller must delete the returned matrix.
Matrix& Matrix::copy() const {
    Matrix& copy = *new Matrix(*this);
    this->copy(copy);
    return copy;
}

/* resizes target if necessary */
// NOTE(review): resize() resets target's layout flag only when the shape
// actually changes; a same-shaped transposed target therefore takes the
// element-loop branch (or, if layouts match, _copyAllTo stamps it with this
// matrix's flag) — the trailing comment below overstates the guarantee.
void Matrix::copy(Matrix& target) const {
    target.resize(this->_numRows, this->_numCols); //target is now non-transposed
    if(this->isTrans() == target.isTrans()) {
        this->_copyAllTo(target);
    } else { //if I'm transposed, make sure that target is non-transposed copy
        this->copy(target, 0, -1, 0, -1, 0, 0);
    }
}

// Raw byte copy; both matrices must already share the same layout flag.
void Matrix::_copyAllTo(Matrix& target) const {
    assert(target.isTrans() == isTrans());
    memcpy((void*) target._data, (void*) this->_data, this->getNumDataBytes());
    target._trans = this->_trans;
}
-
// ---- Reductions ------------------------------------------------------------
// axis == 0 reduces down columns (result 1 x numCols); any other axis value
// reduces across rows (result numRows x 1). The Matrix&-returning forms
// allocate on the heap; caller deletes. MTYPE_MAX comes from matrix.h —
// presumably the largest finite MTYPE, used as the +/- infinity stand-in.

MTYPE Matrix::min() const {
    return _aggregate(&_min, MTYPE_MAX);
}

Matrix& Matrix::min(int64 axis) const {
    Matrix& target = axis == 0 ? *new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1);
    this->min(axis, target);
    return target;
}

void Matrix::min(int64 axis, Matrix& target) const {
    _aggregate(axis, target, &_min, MTYPE_MAX);
}

MTYPE Matrix::max() const {
    return _aggregate(&_max, -MTYPE_MAX);
}

Matrix& Matrix::max(int64 axis) const {
    Matrix& target = axis == 0 ? *new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1);
    this->max(axis, target);
    return target;
}

void Matrix::max(int64 axis, Matrix& target) const {
    _aggregate(axis, target, &_max, -MTYPE_MAX);
}

MTYPE Matrix::sum() const {
    return _aggregate(&_add, 0);
}

// Frobenius norm: sqrt of the sum of squared elements.
MTYPE Matrix::norm() const {
    return sqrt(norm2());
}

// Squared Frobenius norm.
MTYPE Matrix::norm2() const {
    return _aggregate(&_addSquare, 0);
}

Matrix& Matrix::sum(int64 axis) const {
    Matrix& target = axis == 0 ? *new Matrix(1, this->_numCols) : *new Matrix(this->_numRows, 1);
    this->sum(axis, target);
    return target;
}

void Matrix::sum(int64 axis, Matrix& target) const {
    _aggregate(axis, target, &_add, 0);
}

// Axis-wise reduction driver: folds agg_func over each column (axis 0) or
// each row (otherwise), seeding with initialValue. Resizes target.
void Matrix::_aggregate(int64 axis, Matrix& target, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const {
    if (axis == 0) {
        target.resize(1, this->_numCols);
        for (int64 j = 0; j < this->_numCols; j++) {
            target(0, j) = _aggregateCol(j, agg_func, initialValue);
        }
    } else {
        target.resize(this->_numRows, 1);
        for (int64 i = 0; i < this->_numRows; i++) {
            target(i, 0) = _aggregateRow(i, agg_func, initialValue);
        }
    }
}

// Fold agg_func over one row.
MTYPE Matrix::_aggregateRow(int64 row, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const {
    MTYPE v = initialValue;
    for (int64 j = 0; j < this->_numCols; j++) {
        v = agg_func((*this)(row, j), v);
    }
    return v;
}

// Fold agg_func over one column.
MTYPE Matrix::_aggregateCol(int64 col, MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const {
    MTYPE v = initialValue;
    for (int64 i = 0; i < this->_numRows; i++) {
        v = agg_func((*this)(i, col), v);
    }
    return v;
}

// Whole-matrix fold in raw storage order (layout-independent for the
// commutative/associative kernels used above).
MTYPE Matrix::_aggregate(MTYPE (*agg_func)(MTYPE, MTYPE), MTYPE initialValue) const {
    MTYPE v = initialValue;
    MTYPE* ptr = _data;
    for (int64 i = 0; i < getNumElements(); i++, ptr++) {
        v = agg_func(*ptr, v);
    }
    return v;
}
-
-void Matrix::printShape(const char* name) const {
- printf("%s: %lldx%lld\n", name, getNumRows(), getNumCols());
-}
-
-void Matrix::print() const {
- print(0,getNumRows(),0, getNumCols());
-}
-
-void Matrix::print(int64 rows, int64 cols) const {
- print(0,rows,0, cols);
-}
-
-void Matrix::print(int64 startRow, int64 rows, int64 startCol, int64 cols) const {
- for (int64 i = startRow; i < std::min(startRow+rows, this->_numRows); i++) {
- for (int64 j = startCol; j < std::min(startCol+cols, this->_numCols); j++) {
- printf("%.15f ", (*this)(i, j));
- }
- printf("\n");
- }
-}
-
// In-place elementwise application of the named function.
void Matrix::apply(Matrix::FUNCTION f) {
    apply(f, *this);
}


// Dispatches the FUNCTION enum to the matching scalar kernel and applies it
// elementwise into target. Unknown enum values are silently ignored (the
// fatal log is compiled out). NOTE(review): target is NOT resized here —
// _applyLoop writes getNumElements() values into target._data, so callers
// must guarantee target is at least as large; confirm.
void Matrix::apply(Matrix::FUNCTION f, Matrix& target) {
    MTYPE (*func)(MTYPE);
    if(f == EXP) {
        func = &_exp;
    } else if(f == TANH) {
        func = &_tanh;
    } else if(f == RECIPROCAL) {
        func = &_recip;
    } else if (f == SQUARE) {
        func = &_square;
    } else if(f == LOG) {
        func = &_log;
    } else if(f == ZERO) {
        func = &_zero;
    } else if (f == ONE) {
        func = &_one;
    } else if(f == LOGISTIC1) {
        func = &_sigma1;
    } else if(f == LOGISTIC2) {
        func = &_sigma2;
    } else if (f == ABS) {
        func = &_abs;
    } else if (f == SIGN) {
        func = &_sign;
    } else {
        return;
        //LOG(FATAL) << "Matrix::apply: Unknown function type";
    }
    this->_applyLoop(func, target);
}
-
-void Matrix::eltWiseMult(const Matrix& a, Matrix& target) const {
- assert(isSameDims(a));
- target.resize(*this);
- this->_applyLoop2(a, &_mult, target);
-}
-
-void Matrix::eltWiseDivide(const Matrix& a, Matrix& target) const {
- assert(isSameDims(a));
- target.resize(*this);
- this->_applyLoop2(a, &_divide, target);
-}
-
-void Matrix::eltWiseMult(const Matrix& a) {
- eltWiseMult(a, *this);
-}
-
-void Matrix::eltWiseDivide(const Matrix& a) {
- eltWiseDivide(a, *this);
-}
-
-void Matrix::randomizeUniform() {
- this->_applyLoop(&_rand);
-}
-
// Intentional no-op on this backend: per the disabled log below, normal
// randomization is only implemented in the MKL build.
void Matrix::randomizeNormal() {
    //LOG(FATAL) << "randomizeNormal only implemented on MKL!";
}
-
// Intentional no-op on this backend (see the disabled log below); the mean
// and stdev parameters are accepted but unused.
void Matrix::randomizeNormal(MTYPE /*mean*/, MTYPE /*stdev*/) {
    // LOG(FATAL) << "randomizeNormal only implemented on MKL!";
}
-
-void Matrix::eltWiseDivideByVector(const Matrix& vec) {
- eltWiseDivideByVector(vec, *this);
-}
-
-/* This function allocates a chunk of memory at most as big as the input vector */
-void Matrix::eltWiseDivideByVector(const Matrix& vec, Matrix& target) {
- assert(std::min(vec.getNumCols(), vec.getNumRows()) == 1);
- const bool rowVector = vec.getNumRows() == 1;
- assert((rowVector && vec.getNumCols() == getNumCols()) || (!rowVector && vec.getNumRows() == getNumRows()));
- if(&target != this) {
- target.resize(*this);
- }
- _divideByVector(vec, target);
-}
-
-void Matrix::_divideByVector(const Matrix& vec, Matrix& target) {
- Matrix& vecInverse = vec.copy();
- vecInverse.apply(RECIPROCAL);
- eltWiseMultByVector(vecInverse,target);
- delete &vecInverse;
-}
-
-void Matrix::reshape(int64 numRows, int64 numCols) {
- assert(_numElements == numRows*numCols);
- _numRows = numRows;
- _numCols = numCols;
-}
-
-Matrix& Matrix::reshaped(int64 numRows, int64 numCols) {
- assert(_numElements == numRows*numCols);
- return *new Matrix(_data, numRows, numCols, isTrans());
-}
-
-void Matrix::_applyLoop(MTYPE (*func)(MTYPE), Matrix& target) {
- MTYPE *ptr = this->_data, *tgtPtr = target._data;
- for (int64 i = 0; i < getNumElements(); i++, ptr++, tgtPtr++) {
- *tgtPtr = (*func)(*ptr);
- }
-}
-
-void Matrix::_applyLoop(MTYPE (*func)(MTYPE)) {
- _applyLoop(func, *this);
-}
-
-void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE), Matrix& target) const {
- for (int64 i = 0; i < getNumRows(); i++) {
- for (int64 j = 0; j < getNumCols(); j++) {
- target(i, j) = (*func)((*this)(i, j), a(i, j));
- }
- }
-}
-
-void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE), MTYPE scalar, Matrix& target) const {
- for (int64 i = 0; i < getNumRows(); i++) {
- for (int64 j = 0; j < getNumCols(); j++) {
- target(i, j) = (*func)((*this)(i, j), a(i, j), scalar);
- }
- }
-}
-
-void Matrix::_applyLoop2(const Matrix& a, MTYPE (*func)(MTYPE,MTYPE, MTYPE, MTYPE), MTYPE scalar1, MTYPE scalar2, Matrix& target) const {
- for (int64 i = 0; i < getNumRows(); i++) {
- for (int64 j = 0; j < getNumCols(); j++) {
- target(i, j) = (*func)((*this)(i, j), a(i, j), scalar1, scalar2);
- }
- }
-}
-
-void Matrix::_applyLoopScalar(const MTYPE scalar, MTYPE(*func)(MTYPE, MTYPE), Matrix& target) const {
- MTYPE *myPtr = _data;
- MTYPE *targetPtr = target._data;
- for (int64 i = 0; i < getNumElements(); i++, myPtr++, targetPtr++) {
- *targetPtr = (*func)(*myPtr, scalar);
- }
-}
-
-bool Matrix::hasNan() const {
- for (int64 r = 0; r < _numRows; r++) {
- for (int64 c = 0; c < _numCols; c++) {
- if (isnan((*this)(r,c))) {
- return true;
- }
- }
- }
- return false;
-}
-
-bool Matrix::hasInf() const {
- for (int64 r = 0; r < _numRows; r++) {
- for (int64 c = 0; c < _numCols; c++) {
- if (isinf((*this)(r,c))) {
- return true;
- }
- }
- }
- return false;
-}
-
-