Test maximum compute work group sizes
[platform/upstream/VK-GL-CTS.git] / external / vulkancts / modules / vulkan / compute / vktComputeBasicComputeShaderTests.cpp
1 /*------------------------------------------------------------------------
2  * Vulkan Conformance Tests
3  * ------------------------
4  *
5  * Copyright (c) 2019 The Khronos Group Inc.
6  * Copyright (c) 2019 The Android Open Source Project
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  *
20  *//*!
21  * \file
22  * \brief Compute Shader Tests
23  *//*--------------------------------------------------------------------*/
24
25 #include "vktComputeBasicComputeShaderTests.hpp"
26 #include "vktTestCase.hpp"
27 #include "vktTestCaseUtil.hpp"
28 #include "vktComputeTestsUtil.hpp"
29 #include "vktCustomInstancesDevices.hpp"
30
31 #include "vkDefs.hpp"
32 #include "vkRef.hpp"
33 #include "vkRefUtil.hpp"
34 #include "vkPlatform.hpp"
35 #include "vkPrograms.hpp"
36 #include "vkRefUtil.hpp"
37 #include "vkMemUtil.hpp"
38 #include "vkBarrierUtil.hpp"
39 #include "vkQueryUtil.hpp"
40 #include "vkBuilderUtil.hpp"
41 #include "vkTypeUtil.hpp"
42 #include "vkDeviceUtil.hpp"
43 #include "vkCmdUtil.hpp"
44 #include "vkObjUtil.hpp"
45 #include "vkBufferWithMemory.hpp"
46
47 #include "tcuCommandLine.hpp"
48 #include "tcuTestLog.hpp"
49
50 #include "deStringUtil.hpp"
51 #include "deUniquePtr.hpp"
52 #include "deRandom.hpp"
53
54 #include <vector>
55 #include <memory>
56
57 using namespace vk;
58
59 namespace vkt
60 {
61 namespace compute
62 {
63 namespace
64 {
65
66 template<typename T, int size>
67 T multiplyComponents (const tcu::Vector<T, size>& v)
68 {
69         T accum = 1;
70         for (int i = 0; i < size; ++i)
71                 accum *= v[i];
72         return accum;
73 }
74
//! Returns \p a multiplied by itself.
template<typename T>
inline T squared (const T& a)
{
        T result = a * a;
        return result;
}
80
81 inline VkImageCreateInfo make2DImageCreateInfo (const tcu::IVec2& imageSize, const VkImageUsageFlags usage)
82 {
83         const VkImageCreateInfo imageParams =
84         {
85                 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,                            // VkStructureType                      sType;
86                 DE_NULL,                                                                                        // const void*                          pNext;
87                 0u,                                                                                                     // VkImageCreateFlags           flags;
88                 VK_IMAGE_TYPE_2D,                                                                       // VkImageType                          imageType;
89                 VK_FORMAT_R32_UINT,                                                                     // VkFormat                                     format;
90                 vk::makeExtent3D(imageSize.x(), imageSize.y(), 1),      // VkExtent3D                           extent;
91                 1u,                                                                                                     // deUint32                                     mipLevels;
92                 1u,                                                                                                     // deUint32                                     arrayLayers;
93                 VK_SAMPLE_COUNT_1_BIT,                                                          // VkSampleCountFlagBits        samples;
94                 VK_IMAGE_TILING_OPTIMAL,                                                        // VkImageTiling                        tiling;
95                 usage,                                                                                          // VkImageUsageFlags            usage;
96                 VK_SHARING_MODE_EXCLUSIVE,                                                      // VkSharingMode                        sharingMode;
97                 0u,                                                                                                     // deUint32                                     queueFamilyIndexCount;
98                 DE_NULL,                                                                                        // const deUint32*                      pQueueFamilyIndices;
99                 VK_IMAGE_LAYOUT_UNDEFINED,                                                      // VkImageLayout                        initialLayout;
100         };
101         return imageParams;
102 }
103
104 inline VkBufferImageCopy makeBufferImageCopy(const tcu::IVec2& imageSize)
105 {
106         return compute::makeBufferImageCopy(vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), 1u);
107 }
108
//! Buffer flavours distinguished by tests in this file (the users of this enum are outside this excerpt).
enum BufferType
{
        BUFFER_TYPE_UNIFORM = 0,
        BUFFER_TYPE_SSBO,
};
114
115 class SharedVarTest : public vkt::TestCase
116 {
117 public:
118                                                 SharedVarTest   (tcu::TestContext&              testCtx,
119                                                                                  const std::string&             name,
120                                                                                  const std::string&             description,
121                                                                                  const tcu::IVec3&              localSize,
122                                                                                  const tcu::IVec3&              workSize);
123
124         void                            initPrograms    (SourceCollections&             sourceCollections) const;
125         TestInstance*           createInstance  (Context&                               context) const;
126
127 private:
128         const tcu::IVec3        m_localSize;
129         const tcu::IVec3        m_workSize;
130 };
131
132 class SharedVarTestInstance : public vkt::TestInstance
133 {
134 public:
135                                                                         SharedVarTestInstance   (Context&                       context,
136                                                                                                                          const tcu::IVec3&      localSize,
137                                                                                                                          const tcu::IVec3&      workSize);
138
139         tcu::TestStatus                                 iterate                                 (void);
140
141 private:
142         const tcu::IVec3                                m_localSize;
143         const tcu::IVec3                                m_workSize;
144 };
145
146 SharedVarTest::SharedVarTest (tcu::TestContext&         testCtx,
147                                                           const std::string&    name,
148                                                           const std::string&    description,
149                                                           const tcu::IVec3&             localSize,
150                                                           const tcu::IVec3&             workSize)
151         : TestCase              (testCtx, name, description)
152         , m_localSize   (localSize)
153         , m_workSize    (workSize)
154 {
155 }
156
157 void SharedVarTest::initPrograms (SourceCollections& sourceCollections) const
158 {
159         const int workGroupSize = multiplyComponents(m_localSize);
160         const int workGroupCount = multiplyComponents(m_workSize);
161         const int numValues = workGroupSize * workGroupCount;
162
163         std::ostringstream src;
164         src << "#version 310 es\n"
165                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
166                 << "layout(binding = 0) writeonly buffer Output {\n"
167                 << "    uint values[" << numValues << "];\n"
168                 << "} sb_out;\n\n"
169                 << "shared uint offsets[" << workGroupSize << "];\n\n"
170                 << "void main (void) {\n"
171                 << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
172                 << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
173                 << "    uint globalOffs = localSize*globalNdx;\n"
174                 << "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
175                 << "\n"
176                 << "    offsets[localSize-localOffs-1u] = globalOffs + localOffs*localOffs;\n"
177                 << "    memoryBarrierShared();\n"
178                 << "    barrier();\n"
179                 << "    sb_out.values[globalOffs + localOffs] = offsets[localOffs];\n"
180                 << "}\n";
181
182         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
183 }
184
185 TestInstance* SharedVarTest::createInstance (Context& context) const
186 {
187         return new SharedVarTestInstance(context, m_localSize, m_workSize);
188 }
189
190 SharedVarTestInstance::SharedVarTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
191         : TestInstance  (context)
192         , m_localSize   (localSize)
193         , m_workSize    (workSize)
194 {
195 }
196
197 tcu::TestStatus SharedVarTestInstance::iterate (void)
198 {
199         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
200         const VkDevice                  device                          = m_context.getDevice();
201         const VkQueue                   queue                           = m_context.getUniversalQueue();
202         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
203         Allocator&                              allocator                       = m_context.getDefaultAllocator();
204
205         const int workGroupSize = multiplyComponents(m_localSize);
206         const int workGroupCount = multiplyComponents(m_workSize);
207
208         // Create a buffer and host-visible memory for it
209
210         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
211         const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
212
213         // Create descriptor set
214
215         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
216                 DescriptorSetLayoutBuilder()
217                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
218                 .build(vk, device));
219
220         const Unique<VkDescriptorPool> descriptorPool(
221                 DescriptorPoolBuilder()
222                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
223                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
224
225         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
226
227         const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
228         DescriptorSetUpdateBuilder()
229                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
230                 .update(vk, device);
231
232         // Perform the computation
233
234         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
235         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
236         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
237
238         const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
239
240         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
241         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
242
243         // Start recording commands
244
245         beginCommandBuffer(vk, *cmdBuffer);
246
247         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
248         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
249
250         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
251
252         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
253
254         endCommandBuffer(vk, *cmdBuffer);
255
256         // Wait for completion
257
258         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
259
260         // Validate the results
261
262         const Allocation& bufferAllocation = buffer.getAllocation();
263         invalidateAlloc(vk, device, bufferAllocation);
264
265         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
266
267         for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
268         {
269                 const int globalOffset = groupNdx * workGroupSize;
270                 for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
271                 {
272                         const deUint32 res = bufferPtr[globalOffset + localOffset];
273                         const deUint32 ref = globalOffset + squared(workGroupSize - localOffset - 1);
274
275                         if (res != ref)
276                         {
277                                 std::ostringstream msg;
278                                 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
279                                 return tcu::TestStatus::fail(msg.str());
280                         }
281                 }
282         }
283         return tcu::TestStatus::pass("Compute succeeded");
284 }
285
286 class SharedVarAtomicOpTest : public vkt::TestCase
287 {
288 public:
289                                                 SharedVarAtomicOpTest   (tcu::TestContext&      testCtx,
290                                                                                                  const std::string&     name,
291                                                                                                  const std::string&     description,
292                                                                                                  const tcu::IVec3&      localSize,
293                                                                                                  const tcu::IVec3&      workSize);
294
295         void                            initPrograms                    (SourceCollections& sourceCollections) const;
296         TestInstance*           createInstance                  (Context&                       context) const;
297
298 private:
299         const tcu::IVec3        m_localSize;
300         const tcu::IVec3        m_workSize;
301 };
302
303 class SharedVarAtomicOpTestInstance : public vkt::TestInstance
304 {
305 public:
306                                                                         SharedVarAtomicOpTestInstance   (Context&                       context,
307                                                                                                                                          const tcu::IVec3&      localSize,
308                                                                                                                                          const tcu::IVec3&      workSize);
309
310         tcu::TestStatus                                 iterate                                                 (void);
311
312 private:
313         const tcu::IVec3                                m_localSize;
314         const tcu::IVec3                                m_workSize;
315 };
316
317 SharedVarAtomicOpTest::SharedVarAtomicOpTest (tcu::TestContext&         testCtx,
318                                                                                           const std::string&    name,
319                                                                                           const std::string&    description,
320                                                                                           const tcu::IVec3&             localSize,
321                                                                                           const tcu::IVec3&             workSize)
322         : TestCase              (testCtx, name, description)
323         , m_localSize   (localSize)
324         , m_workSize    (workSize)
325 {
326 }
327
328 void SharedVarAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
329 {
330         const int workGroupSize = multiplyComponents(m_localSize);
331         const int workGroupCount = multiplyComponents(m_workSize);
332         const int numValues = workGroupSize * workGroupCount;
333
334         std::ostringstream src;
335         src << "#version 310 es\n"
336                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
337                 << "layout(binding = 0) writeonly buffer Output {\n"
338                 << "    uint values[" << numValues << "];\n"
339                 << "} sb_out;\n\n"
340                 << "shared uint count;\n\n"
341                 << "void main (void) {\n"
342                 << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
343                 << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
344                 << "    uint globalOffs = localSize*globalNdx;\n"
345                 << "\n"
346                 << "    count = 0u;\n"
347                 << "    memoryBarrierShared();\n"
348                 << "    barrier();\n"
349                 << "    uint oldVal = atomicAdd(count, 1u);\n"
350                 << "    sb_out.values[globalOffs+oldVal] = oldVal+1u;\n"
351                 << "}\n";
352
353         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
354 }
355
356 TestInstance* SharedVarAtomicOpTest::createInstance (Context& context) const
357 {
358         return new SharedVarAtomicOpTestInstance(context, m_localSize, m_workSize);
359 }
360
361 SharedVarAtomicOpTestInstance::SharedVarAtomicOpTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
362         : TestInstance  (context)
363         , m_localSize   (localSize)
364         , m_workSize    (workSize)
365 {
366 }
367
368 tcu::TestStatus SharedVarAtomicOpTestInstance::iterate (void)
369 {
370         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
371         const VkDevice                  device                          = m_context.getDevice();
372         const VkQueue                   queue                           = m_context.getUniversalQueue();
373         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
374         Allocator&                              allocator                       = m_context.getDefaultAllocator();
375
376         const int workGroupSize = multiplyComponents(m_localSize);
377         const int workGroupCount = multiplyComponents(m_workSize);
378
379         // Create a buffer and host-visible memory for it
380
381         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
382         const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
383
384         // Create descriptor set
385
386         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
387                 DescriptorSetLayoutBuilder()
388                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
389                 .build(vk, device));
390
391         const Unique<VkDescriptorPool> descriptorPool(
392                 DescriptorPoolBuilder()
393                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
394                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
395
396         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
397
398         const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
399         DescriptorSetUpdateBuilder()
400                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
401                 .update(vk, device);
402
403         // Perform the computation
404
405         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
406         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
407         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
408
409         const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
410
411         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
412         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
413
414         // Start recording commands
415
416         beginCommandBuffer(vk, *cmdBuffer);
417
418         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
419         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
420
421         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
422
423         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1u, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
424
425         endCommandBuffer(vk, *cmdBuffer);
426
427         // Wait for completion
428
429         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
430
431         // Validate the results
432
433         const Allocation& bufferAllocation = buffer.getAllocation();
434         invalidateAlloc(vk, device, bufferAllocation);
435
436         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
437
438         for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
439         {
440                 const int globalOffset = groupNdx * workGroupSize;
441                 for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
442                 {
443                         const deUint32 res = bufferPtr[globalOffset + localOffset];
444                         const deUint32 ref = localOffset + 1;
445
446                         if (res != ref)
447                         {
448                                 std::ostringstream msg;
449                                 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
450                                 return tcu::TestStatus::fail(msg.str());
451                         }
452                 }
453         }
454         return tcu::TestStatus::pass("Compute succeeded");
455 }
456
457 class SSBOLocalBarrierTest : public vkt::TestCase
458 {
459 public:
460                                                 SSBOLocalBarrierTest    (tcu::TestContext&      testCtx,
461                                                                                                  const std::string& name,
462                                                                                                  const std::string&     description,
463                                                                                                  const tcu::IVec3&      localSize,
464                                                                                                  const tcu::IVec3&      workSize);
465
466         void                            initPrograms                    (SourceCollections& sourceCollections) const;
467         TestInstance*           createInstance                  (Context&                       context) const;
468
469 private:
470         const tcu::IVec3        m_localSize;
471         const tcu::IVec3        m_workSize;
472 };
473
474 class SSBOLocalBarrierTestInstance : public vkt::TestInstance
475 {
476 public:
477                                                                         SSBOLocalBarrierTestInstance    (Context&                       context,
478                                                                                                                                          const tcu::IVec3&      localSize,
479                                                                                                                                          const tcu::IVec3&      workSize);
480
481         tcu::TestStatus                                 iterate                                                 (void);
482
483 private:
484         const tcu::IVec3                                m_localSize;
485         const tcu::IVec3                                m_workSize;
486 };
487
488 SSBOLocalBarrierTest::SSBOLocalBarrierTest (tcu::TestContext&   testCtx,
489                                                                                         const std::string&      name,
490                                                                                         const std::string&      description,
491                                                                                         const tcu::IVec3&       localSize,
492                                                                                         const tcu::IVec3&       workSize)
493         : TestCase              (testCtx, name, description)
494         , m_localSize   (localSize)
495         , m_workSize    (workSize)
496 {
497 }
498
499 void SSBOLocalBarrierTest::initPrograms (SourceCollections& sourceCollections) const
500 {
501         const int workGroupSize = multiplyComponents(m_localSize);
502         const int workGroupCount = multiplyComponents(m_workSize);
503         const int numValues = workGroupSize * workGroupCount;
504
505         std::ostringstream src;
506         src << "#version 310 es\n"
507                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
508                 << "layout(binding = 0) coherent buffer Output {\n"
509                 << "    uint values[" << numValues << "];\n"
510                 << "} sb_out;\n\n"
511                 << "void main (void) {\n"
512                 << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
513                 << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
514                 << "    uint globalOffs = localSize*globalNdx;\n"
515                 << "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
516                 << "\n"
517                 << "    sb_out.values[globalOffs + localOffs] = globalOffs;\n"
518                 << "    memoryBarrierBuffer();\n"
519                 << "    barrier();\n"
520                 << "    sb_out.values[globalOffs + ((localOffs+1u)%localSize)] += localOffs;\n"         // += so we read and write
521                 << "    memoryBarrierBuffer();\n"
522                 << "    barrier();\n"
523                 << "    sb_out.values[globalOffs + ((localOffs+2u)%localSize)] += localOffs;\n"
524                 << "}\n";
525
526         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
527 }
528
529 TestInstance* SSBOLocalBarrierTest::createInstance (Context& context) const
530 {
531         return new SSBOLocalBarrierTestInstance(context, m_localSize, m_workSize);
532 }
533
534 SSBOLocalBarrierTestInstance::SSBOLocalBarrierTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
535         : TestInstance  (context)
536         , m_localSize   (localSize)
537         , m_workSize    (workSize)
538 {
539 }
540
541 tcu::TestStatus SSBOLocalBarrierTestInstance::iterate (void)
542 {
543         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
544         const VkDevice                  device                          = m_context.getDevice();
545         const VkQueue                   queue                           = m_context.getUniversalQueue();
546         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
547         Allocator&                              allocator                       = m_context.getDefaultAllocator();
548
549         const int workGroupSize = multiplyComponents(m_localSize);
550         const int workGroupCount = multiplyComponents(m_workSize);
551
552         // Create a buffer and host-visible memory for it
553
554         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
555         const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
556
557         // Create descriptor set
558
559         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
560                 DescriptorSetLayoutBuilder()
561                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
562                 .build(vk, device));
563
564         const Unique<VkDescriptorPool> descriptorPool(
565                 DescriptorPoolBuilder()
566                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
567                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
568
569         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
570
571         const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
572         DescriptorSetUpdateBuilder()
573                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
574                 .update(vk, device);
575
576         // Perform the computation
577
578         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
579         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
580         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
581
582         const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
583
584         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
585         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
586
587         // Start recording commands
588
589         beginCommandBuffer(vk, *cmdBuffer);
590
591         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
592         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
593
594         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
595
596         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
597
598         endCommandBuffer(vk, *cmdBuffer);
599
600         // Wait for completion
601
602         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
603
604         // Validate the results
605
606         const Allocation& bufferAllocation = buffer.getAllocation();
607         invalidateAlloc(vk, device, bufferAllocation);
608
609         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
610
611         for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
612         {
613                 const int globalOffset = groupNdx * workGroupSize;
614                 for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
615                 {
616                         const deUint32  res             = bufferPtr[globalOffset + localOffset];
617                         const int               offs0   = localOffset - 1 < 0 ? ((localOffset + workGroupSize - 1) % workGroupSize) : ((localOffset - 1) % workGroupSize);
618                         const int               offs1   = localOffset - 2 < 0 ? ((localOffset + workGroupSize - 2) % workGroupSize) : ((localOffset - 2) % workGroupSize);
619                         const deUint32  ref             = static_cast<deUint32>(globalOffset + offs0 + offs1);
620
621                         if (res != ref)
622                         {
623                                 std::ostringstream msg;
624                                 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
625                                 return tcu::TestStatus::fail(msg.str());
626                         }
627                 }
628         }
629         return tcu::TestStatus::pass("Compute succeeded");
630 }
631
632 class CopyImageToSSBOTest : public vkt::TestCase
633 {
634 public:
635                                                 CopyImageToSSBOTest             (tcu::TestContext&      testCtx,
636                                                                                                  const std::string&     name,
637                                                                                                  const std::string&     description,
638                                                                                                  const tcu::IVec2&      localSize,
639                                                                                                  const tcu::IVec2&      imageSize);
640
641         void                            initPrograms                    (SourceCollections& sourceCollections) const;
642         TestInstance*           createInstance                  (Context&                       context) const;
643
644 private:
645         const tcu::IVec2        m_localSize;
646         const tcu::IVec2        m_imageSize;
647 };
648
649 class CopyImageToSSBOTestInstance : public vkt::TestInstance
650 {
651 public:
652                                                                         CopyImageToSSBOTestInstance             (Context&                       context,
653                                                                                                                                          const tcu::IVec2&      localSize,
654                                                                                                                                          const tcu::IVec2&      imageSize);
655
656         tcu::TestStatus                                 iterate                                                 (void);
657
658 private:
659         const tcu::IVec2                                m_localSize;
660         const tcu::IVec2                                m_imageSize;
661 };
662
663 CopyImageToSSBOTest::CopyImageToSSBOTest (tcu::TestContext&             testCtx,
664                                                                                   const std::string&    name,
665                                                                                   const std::string&    description,
666                                                                                   const tcu::IVec2&             localSize,
667                                                                                   const tcu::IVec2&             imageSize)
668         : TestCase              (testCtx, name, description)
669         , m_localSize   (localSize)
670         , m_imageSize   (imageSize)
671 {
672         DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
673         DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
674 }
675
676 void CopyImageToSSBOTest::initPrograms (SourceCollections& sourceCollections) const
677 {
678         std::ostringstream src;
679         src << "#version 310 es\n"
680                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
681                 << "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_srcImg;\n"
682                 << "layout(binding = 0) writeonly buffer Output {\n"
683                 << "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
684                 << "} sb_out;\n\n"
685                 << "void main (void) {\n"
686                 << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
687                 << "    uint value  = imageLoad(u_srcImg, ivec2(gl_GlobalInvocationID.xy)).x;\n"
688                 << "    sb_out.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x] = value;\n"
689                 << "}\n";
690
691         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
692 }
693
694 TestInstance* CopyImageToSSBOTest::createInstance (Context& context) const
695 {
696         return new CopyImageToSSBOTestInstance(context, m_localSize, m_imageSize);
697 }
698
699 CopyImageToSSBOTestInstance::CopyImageToSSBOTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
700         : TestInstance  (context)
701         , m_localSize   (localSize)
702         , m_imageSize   (imageSize)
703 {
704 }
705
706 tcu::TestStatus CopyImageToSSBOTestInstance::iterate (void)
707 {
708         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
709         const VkDevice                  device                          = m_context.getDevice();
710         const VkQueue                   queue                           = m_context.getUniversalQueue();
711         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
712         Allocator&                              allocator                       = m_context.getDefaultAllocator();
713
714         // Create an image
715
716         const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
717         const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
718
719         const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
720         const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
721
722         // Staging buffer (source data for image)
723
724         const deUint32 imageArea = multiplyComponents(m_imageSize);
725         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;
726
727         const Buffer stagingBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_SRC_BIT), MemoryRequirement::HostVisible);
728
729         // Populate the staging buffer with test data
730         {
731                 de::Random rnd(0xab2c7);
732                 const Allocation& stagingBufferAllocation = stagingBuffer.getAllocation();
733                 deUint32* bufferPtr = static_cast<deUint32*>(stagingBufferAllocation.getHostPtr());
734                 for (deUint32 i = 0; i < imageArea; ++i)
735                         *bufferPtr++ = rnd.getUint32();
736
737                 flushAlloc(vk, device, stagingBufferAllocation);
738         }
739
740         // Create a buffer to store shader output
741
742         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
743
744         // Create descriptor set
745
746         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
747                 DescriptorSetLayoutBuilder()
748                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
749                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
750                 .build(vk, device));
751
752         const Unique<VkDescriptorPool> descriptorPool(
753                 DescriptorPoolBuilder()
754                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
755                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
756                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
757
758         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
759
760         // Set the bindings
761
762         const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
763         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
764
765         DescriptorSetUpdateBuilder()
766                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
767                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
768                 .update(vk, device);
769
770         // Perform the computation
771         {
772                 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
773                 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
774                 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
775
776                 const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
777                 const tcu::IVec2 workSize = m_imageSize / m_localSize;
778
779                 // Prepare the command buffer
780
781                 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
782                 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
783
784                 // Start recording commands
785
786                 beginCommandBuffer(vk, *cmdBuffer);
787
788                 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
789                 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
790
791                 const std::vector<VkBufferImageCopy> bufferImageCopy(1, makeBufferImageCopy(m_imageSize));
792                 copyBufferToImage(vk, *cmdBuffer, *stagingBuffer, bufferSizeBytes, bufferImageCopy, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, *image, VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
793
794                 vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
795                 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
796
797                 endCommandBuffer(vk, *cmdBuffer);
798
799                 // Wait for completion
800
801                 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
802         }
803
804         // Validate the results
805
806         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
807         invalidateAlloc(vk, device, outputBufferAllocation);
808
809         const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
810         const deUint32* refBufferPtr = static_cast<deUint32*>(stagingBuffer.getAllocation().getHostPtr());
811
812         for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
813         {
814                 const deUint32 res = *(bufferPtr + ndx);
815                 const deUint32 ref = *(refBufferPtr + ndx);
816
817                 if (res != ref)
818                 {
819                         std::ostringstream msg;
820                         msg << "Comparison failed for Output.values[" << ndx << "]";
821                         return tcu::TestStatus::fail(msg.str());
822                 }
823         }
824         return tcu::TestStatus::pass("Compute succeeded");
825 }
826
827 class CopySSBOToImageTest : public vkt::TestCase
828 {
829 public:
830                                                 CopySSBOToImageTest     (tcu::TestContext&      testCtx,
831                                                                                          const std::string&     name,
832                                                                                          const std::string&     description,
833                                                                                          const tcu::IVec2&      localSize,
834                                                                                          const tcu::IVec2&      imageSize);
835
836         void                            initPrograms            (SourceCollections& sourceCollections) const;
837         TestInstance*           createInstance          (Context&                       context) const;
838
839 private:
840         const tcu::IVec2        m_localSize;
841         const tcu::IVec2        m_imageSize;
842 };
843
844 class CopySSBOToImageTestInstance : public vkt::TestInstance
845 {
846 public:
847                                                                         CopySSBOToImageTestInstance     (Context&                       context,
848                                                                                                                                  const tcu::IVec2&      localSize,
849                                                                                                                                  const tcu::IVec2&      imageSize);
850
851         tcu::TestStatus                                 iterate                                         (void);
852
853 private:
854         const tcu::IVec2                                m_localSize;
855         const tcu::IVec2                                m_imageSize;
856 };
857
858 CopySSBOToImageTest::CopySSBOToImageTest (tcu::TestContext&             testCtx,
859                                                                                   const std::string&    name,
860                                                                                   const std::string&    description,
861                                                                                   const tcu::IVec2&             localSize,
862                                                                                   const tcu::IVec2&             imageSize)
863         : TestCase              (testCtx, name, description)
864         , m_localSize   (localSize)
865         , m_imageSize   (imageSize)
866 {
867         DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
868         DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
869 }
870
871 void CopySSBOToImageTest::initPrograms (SourceCollections& sourceCollections) const
872 {
873         std::ostringstream src;
874         src << "#version 310 es\n"
875                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
876                 << "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_dstImg;\n"
877                 << "layout(binding = 0) readonly buffer Input {\n"
878                 << "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
879                 << "} sb_in;\n\n"
880                 << "void main (void) {\n"
881                 << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
882                 << "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
883                 << "    imageStore(u_dstImg, ivec2(gl_GlobalInvocationID.xy), uvec4(value, 0, 0, 0));\n"
884                 << "}\n";
885
886         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
887 }
888
889 TestInstance* CopySSBOToImageTest::createInstance (Context& context) const
890 {
891         return new CopySSBOToImageTestInstance(context, m_localSize, m_imageSize);
892 }
893
894 CopySSBOToImageTestInstance::CopySSBOToImageTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
895         : TestInstance  (context)
896         , m_localSize   (localSize)
897         , m_imageSize   (imageSize)
898 {
899 }
900
901 tcu::TestStatus CopySSBOToImageTestInstance::iterate (void)
902 {
903         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
904         const VkDevice                  device                          = m_context.getDevice();
905         const VkQueue                   queue                           = m_context.getUniversalQueue();
906         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
907         Allocator&                              allocator                       = m_context.getDefaultAllocator();
908
909         // Create an image
910
911         const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
912         const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
913
914         const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
915         const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
916
917         // Create an input buffer (data to be read in the shader)
918
919         const deUint32 imageArea = multiplyComponents(m_imageSize);
920         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;
921
922         const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
923
924         // Populate the buffer with test data
925         {
926                 de::Random rnd(0x77238ac2);
927                 const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
928                 deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
929                 for (deUint32 i = 0; i < imageArea; ++i)
930                         *bufferPtr++ = rnd.getUint32();
931
932                 flushAlloc(vk, device, inputBufferAllocation);
933         }
934
935         // Create a buffer to store shader output (copied from image data)
936
937         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
938
939         // Create descriptor set
940
941         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
942                 DescriptorSetLayoutBuilder()
943                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
944                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
945                 .build(vk, device));
946
947         const Unique<VkDescriptorPool> descriptorPool(
948                 DescriptorPoolBuilder()
949                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
950                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
951                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
952
953         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
954
955         // Set the bindings
956
957         const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
958         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
959
960         DescriptorSetUpdateBuilder()
961                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
962                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
963                 .update(vk, device);
964
965         // Perform the computation
966         {
967                 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
968                 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
969                 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
970
971                 const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);
972
973                 const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
974                         0u, 0u,
975                         VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
976                         *image, subresourceRange);
977
978                 const tcu::IVec2 workSize = m_imageSize / m_localSize;
979
980                 // Prepare the command buffer
981
982                 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
983                 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
984
985                 // Start recording commands
986
987                 beginCommandBuffer(vk, *cmdBuffer);
988
989                 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
990                 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
991
992                 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
993                 vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
994
995                 copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);
996
997                 endCommandBuffer(vk, *cmdBuffer);
998
999                 // Wait for completion
1000
1001                 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1002         }
1003
1004         // Validate the results
1005
1006         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1007         invalidateAlloc(vk, device, outputBufferAllocation);
1008
1009         const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1010         const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());
1011
1012         for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
1013         {
1014                 const deUint32 res = *(bufferPtr + ndx);
1015                 const deUint32 ref = *(refBufferPtr + ndx);
1016
1017                 if (res != ref)
1018                 {
1019                         std::ostringstream msg;
1020                         msg << "Comparison failed for pixel " << ndx;
1021                         return tcu::TestStatus::fail(msg.str());
1022                 }
1023         }
1024         return tcu::TestStatus::pass("Compute succeeded");
1025 }
1026
1027 class BufferToBufferInvertTest : public vkt::TestCase
1028 {
1029 public:
1030         void                                                            initPrograms                            (SourceCollections&     sourceCollections) const;
1031         TestInstance*                                           createInstance                          (Context&                       context) const;
1032
1033         static BufferToBufferInvertTest*        UBOToSSBOInvertCase                     (tcu::TestContext&      testCtx,
1034                                                                                                                                          const std::string& name,
1035                                                                                                                                          const std::string& description,
1036                                                                                                                                          const deUint32         numValues,
1037                                                                                                                                          const tcu::IVec3&      localSize,
1038                                                                                                                                          const tcu::IVec3&      workSize);
1039
1040         static BufferToBufferInvertTest*        CopyInvertSSBOCase                      (tcu::TestContext&      testCtx,
1041                                                                                                                                          const std::string& name,
1042                                                                                                                                          const std::string& description,
1043                                                                                                                                          const deUint32         numValues,
1044                                                                                                                                          const tcu::IVec3&      localSize,
1045                                                                                                                                          const tcu::IVec3&      workSize);
1046
1047 private:
1048                                                                                 BufferToBufferInvertTest        (tcu::TestContext&      testCtx,
1049                                                                                                                                          const std::string& name,
1050                                                                                                                                          const std::string& description,
1051                                                                                                                                          const deUint32         numValues,
1052                                                                                                                                          const tcu::IVec3&      localSize,
1053                                                                                                                                          const tcu::IVec3&      workSize,
1054                                                                                                                                          const BufferType       bufferType);
1055
1056         const BufferType                                        m_bufferType;
1057         const deUint32                                          m_numValues;
1058         const tcu::IVec3                                        m_localSize;
1059         const tcu::IVec3                                        m_workSize;
1060 };
1061
1062 class BufferToBufferInvertTestInstance : public vkt::TestInstance
1063 {
1064 public:
1065                                                                         BufferToBufferInvertTestInstance        (Context&                       context,
1066                                                                                                                                                  const deUint32         numValues,
1067                                                                                                                                                  const tcu::IVec3&      localSize,
1068                                                                                                                                                  const tcu::IVec3&      workSize,
1069                                                                                                                                                  const BufferType       bufferType);
1070
1071         tcu::TestStatus                                 iterate                                                         (void);
1072
1073 private:
1074         const BufferType                                m_bufferType;
1075         const deUint32                                  m_numValues;
1076         const tcu::IVec3                                m_localSize;
1077         const tcu::IVec3                                m_workSize;
1078 };
1079
1080 BufferToBufferInvertTest::BufferToBufferInvertTest (tcu::TestContext&   testCtx,
1081                                                                                                         const std::string&      name,
1082                                                                                                         const std::string&      description,
1083                                                                                                         const deUint32          numValues,
1084                                                                                                         const tcu::IVec3&       localSize,
1085                                                                                                         const tcu::IVec3&       workSize,
1086                                                                                                         const BufferType        bufferType)
1087         : TestCase              (testCtx, name, description)
1088         , m_bufferType  (bufferType)
1089         , m_numValues   (numValues)
1090         , m_localSize   (localSize)
1091         , m_workSize    (workSize)
1092 {
1093         DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1094         DE_ASSERT(m_bufferType == BUFFER_TYPE_UNIFORM || m_bufferType == BUFFER_TYPE_SSBO);
1095 }
1096
1097 BufferToBufferInvertTest* BufferToBufferInvertTest::UBOToSSBOInvertCase (tcu::TestContext&      testCtx,
1098                                                                                                                                                  const std::string&     name,
1099                                                                                                                                                  const std::string&     description,
1100                                                                                                                                                  const deUint32         numValues,
1101                                                                                                                                                  const tcu::IVec3&      localSize,
1102                                                                                                                                                  const tcu::IVec3&      workSize)
1103 {
1104         return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_UNIFORM);
1105 }
1106
1107 BufferToBufferInvertTest* BufferToBufferInvertTest::CopyInvertSSBOCase (tcu::TestContext&       testCtx,
1108                                                                                                                                                 const std::string&      name,
1109                                                                                                                                                 const std::string&      description,
1110                                                                                                                                                 const deUint32          numValues,
1111                                                                                                                                                 const tcu::IVec3&       localSize,
1112                                                                                                                                                 const tcu::IVec3&       workSize)
1113 {
1114         return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_SSBO);
1115 }
1116
1117 void BufferToBufferInvertTest::initPrograms (SourceCollections& sourceCollections) const
1118 {
1119         std::ostringstream src;
1120         if (m_bufferType == BUFFER_TYPE_UNIFORM)
1121         {
1122                 src << "#version 310 es\n"
1123                         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1124                         << "layout(binding = 0) readonly uniform Input {\n"
1125                         << "    uint values[" << m_numValues << "];\n"
1126                         << "} ub_in;\n"
1127                         << "layout(binding = 1, std140) writeonly buffer Output {\n"
1128                         << "    uint values[" << m_numValues << "];\n"
1129                         << "} sb_out;\n"
1130                         << "void main (void) {\n"
1131                         << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1132                         << "    uint numValuesPerInv = uint(ub_in.values.length()) / (size.x*size.y*size.z);\n"
1133                         << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1134                         << "    uint offset          = numValuesPerInv*groupNdx;\n"
1135                         << "\n"
1136                         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1137                         << "        sb_out.values[offset + ndx] = ~ub_in.values[offset + ndx];\n"
1138                         << "}\n";
1139         }
1140         else if (m_bufferType == BUFFER_TYPE_SSBO)
1141         {
1142                 src << "#version 310 es\n"
1143                         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1144                         << "layout(binding = 0, std140) readonly buffer Input {\n"
1145                         << "    uint values[" << m_numValues << "];\n"
1146                         << "} sb_in;\n"
1147                         << "layout (binding = 1, std140) writeonly buffer Output {\n"
1148                         << "    uint values[" << m_numValues << "];\n"
1149                         << "} sb_out;\n"
1150                         << "void main (void) {\n"
1151                         << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1152                         << "    uint numValuesPerInv = uint(sb_in.values.length()) / (size.x*size.y*size.z);\n"
1153                         << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1154                         << "    uint offset          = numValuesPerInv*groupNdx;\n"
1155                         << "\n"
1156                         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1157                         << "        sb_out.values[offset + ndx] = ~sb_in.values[offset + ndx];\n"
1158                         << "}\n";
1159         }
1160
1161         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1162 }
1163
1164 TestInstance* BufferToBufferInvertTest::createInstance (Context& context) const
1165 {
1166         return new BufferToBufferInvertTestInstance(context, m_numValues, m_localSize, m_workSize, m_bufferType);
1167 }
1168
1169 BufferToBufferInvertTestInstance::BufferToBufferInvertTestInstance (Context&                    context,
1170                                                                                                                                         const deUint32          numValues,
1171                                                                                                                                         const tcu::IVec3&       localSize,
1172                                                                                                                                         const tcu::IVec3&       workSize,
1173                                                                                                                                         const BufferType        bufferType)
1174         : TestInstance  (context)
1175         , m_bufferType  (bufferType)
1176         , m_numValues   (numValues)
1177         , m_localSize   (localSize)
1178         , m_workSize    (workSize)
1179 {
1180 }
1181
1182 tcu::TestStatus BufferToBufferInvertTestInstance::iterate (void)
1183 {
1184         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
1185         const VkDevice                  device                          = m_context.getDevice();
1186         const VkQueue                   queue                           = m_context.getUniversalQueue();
1187         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
1188         Allocator&                              allocator                       = m_context.getDefaultAllocator();
1189
1190         // Customize the test based on buffer type
1191
1192         const VkBufferUsageFlags inputBufferUsageFlags          = (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT : VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
1193         const VkDescriptorType inputBufferDescriptorType        = (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1194         const deUint32 randomSeed                                                       = (m_bufferType == BUFFER_TYPE_UNIFORM ? 0x111223f : 0x124fef);
1195
1196         // Create an input buffer
1197
1198         const VkDeviceSize bufferSizeBytes = sizeof(tcu::UVec4) * m_numValues;
1199         const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, inputBufferUsageFlags), MemoryRequirement::HostVisible);
1200
1201         // Fill the input buffer with data
1202         {
1203                 de::Random rnd(randomSeed);
1204                 const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
1205                 tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(inputBufferAllocation.getHostPtr());
1206                 for (deUint32 i = 0; i < m_numValues; ++i)
1207                         bufferPtr[i].x() = rnd.getUint32();
1208
1209                 flushAlloc(vk, device, inputBufferAllocation);
1210         }
1211
1212         // Create an output buffer
1213
1214         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1215
1216         // Create descriptor set
1217
1218         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1219                 DescriptorSetLayoutBuilder()
1220                 .addSingleBinding(inputBufferDescriptorType, VK_SHADER_STAGE_COMPUTE_BIT)
1221                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1222                 .build(vk, device));
1223
1224         const Unique<VkDescriptorPool> descriptorPool(
1225                 DescriptorPoolBuilder()
1226                 .addType(inputBufferDescriptorType)
1227                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
1228                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1229
1230         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1231
1232         const VkDescriptorBufferInfo inputBufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
1233         const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
1234         DescriptorSetUpdateBuilder()
1235                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), inputBufferDescriptorType, &inputBufferDescriptorInfo)
1236                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
1237                 .update(vk, device);
1238
1239         // Perform the computation
1240
1241         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1242         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1243         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1244
1245         const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);
1246
1247         const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
1248
1249         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1250         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1251
1252         // Start recording commands
1253
1254         beginCommandBuffer(vk, *cmdBuffer);
1255
1256         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1257         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1258
1259         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1260         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1261         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1262
1263         endCommandBuffer(vk, *cmdBuffer);
1264
1265         // Wait for completion
1266
1267         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1268
1269         // Validate the results
1270
1271         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1272         invalidateAlloc(vk, device, outputBufferAllocation);
1273
1274         const tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(outputBufferAllocation.getHostPtr());
1275         const tcu::UVec4* refBufferPtr = static_cast<tcu::UVec4*>(inputBuffer.getAllocation().getHostPtr());
1276
1277         for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1278         {
1279                 const deUint32 res = bufferPtr[ndx].x();
1280                 const deUint32 ref = ~refBufferPtr[ndx].x();
1281
1282                 if (res != ref)
1283                 {
1284                         std::ostringstream msg;
1285                         msg << "Comparison failed for Output.values[" << ndx << "]";
1286                         return tcu::TestStatus::fail(msg.str());
1287                 }
1288         }
1289         return tcu::TestStatus::pass("Compute succeeded");
1290 }
1291
1292 class InvertSSBOInPlaceTest : public vkt::TestCase
1293 {
1294 public:
1295                                                 InvertSSBOInPlaceTest   (tcu::TestContext&      testCtx,
1296                                                                                                  const std::string&     name,
1297                                                                                                  const std::string&     description,
1298                                                                                                  const deUint32         numValues,
1299                                                                                                  const bool                     sized,
1300                                                                                                  const tcu::IVec3&      localSize,
1301                                                                                                  const tcu::IVec3&      workSize);
1302
1303
1304         void                            initPrograms                    (SourceCollections& sourceCollections) const;
1305         TestInstance*           createInstance                  (Context&                       context) const;
1306
1307 private:
1308         const deUint32          m_numValues;
1309         const bool                      m_sized;
1310         const tcu::IVec3        m_localSize;
1311         const tcu::IVec3        m_workSize;
1312 };
1313
1314 class InvertSSBOInPlaceTestInstance : public vkt::TestInstance
1315 {
1316 public:
1317                                                                         InvertSSBOInPlaceTestInstance   (Context&                       context,
1318                                                                                                                                          const deUint32         numValues,
1319                                                                                                                                          const tcu::IVec3&      localSize,
1320                                                                                                                                          const tcu::IVec3&      workSize);
1321
1322         tcu::TestStatus                                 iterate                                                 (void);
1323
1324 private:
1325         const deUint32                                  m_numValues;
1326         const tcu::IVec3                                m_localSize;
1327         const tcu::IVec3                                m_workSize;
1328 };
1329
1330 InvertSSBOInPlaceTest::InvertSSBOInPlaceTest (tcu::TestContext&         testCtx,
1331                                                                                           const std::string&    name,
1332                                                                                           const std::string&    description,
1333                                                                                           const deUint32                numValues,
1334                                                                                           const bool                    sized,
1335                                                                                           const tcu::IVec3&             localSize,
1336                                                                                           const tcu::IVec3&             workSize)
1337         : TestCase              (testCtx, name, description)
1338         , m_numValues   (numValues)
1339         , m_sized               (sized)
1340         , m_localSize   (localSize)
1341         , m_workSize    (workSize)
1342 {
1343         DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1344 }
1345
1346 void InvertSSBOInPlaceTest::initPrograms (SourceCollections& sourceCollections) const
1347 {
1348         std::ostringstream src;
1349         src << "#version 310 es\n"
1350                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1351                 << "layout(binding = 0) buffer InOut {\n"
1352                 << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1353                 << "} sb_inout;\n"
1354                 << "void main (void) {\n"
1355                 << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1356                 << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
1357                 << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1358                 << "    uint offset          = numValuesPerInv*groupNdx;\n"
1359                 << "\n"
1360                 << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1361                 << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
1362                 << "}\n";
1363
1364         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1365 }
1366
1367 TestInstance* InvertSSBOInPlaceTest::createInstance (Context& context) const
1368 {
1369         return new InvertSSBOInPlaceTestInstance(context, m_numValues, m_localSize, m_workSize);
1370 }
1371
1372 InvertSSBOInPlaceTestInstance::InvertSSBOInPlaceTestInstance (Context&                  context,
1373                                                                                                                           const deUint32        numValues,
1374                                                                                                                           const tcu::IVec3&     localSize,
1375                                                                                                                           const tcu::IVec3&     workSize)
1376         : TestInstance  (context)
1377         , m_numValues   (numValues)
1378         , m_localSize   (localSize)
1379         , m_workSize    (workSize)
1380 {
1381 }
1382
1383 tcu::TestStatus InvertSSBOInPlaceTestInstance::iterate (void)
1384 {
1385         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
1386         const VkDevice                  device                          = m_context.getDevice();
1387         const VkQueue                   queue                           = m_context.getUniversalQueue();
1388         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
1389         Allocator&                              allocator                       = m_context.getDefaultAllocator();
1390
1391         // Create an input/output buffer
1392
1393         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
1394         const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1395
1396         // Fill the buffer with data
1397
1398         typedef std::vector<deUint32> data_vector_t;
1399         data_vector_t inputData(m_numValues);
1400
1401         {
1402                 de::Random rnd(0x82ce7f);
1403                 const Allocation& bufferAllocation = buffer.getAllocation();
1404                 deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
1405                 for (deUint32 i = 0; i < m_numValues; ++i)
1406                         inputData[i] = *bufferPtr++ = rnd.getUint32();
1407
1408                 flushAlloc(vk, device, bufferAllocation);
1409         }
1410
1411         // Create descriptor set
1412
1413         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1414                 DescriptorSetLayoutBuilder()
1415                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1416                 .build(vk, device));
1417
1418         const Unique<VkDescriptorPool> descriptorPool(
1419                 DescriptorPoolBuilder()
1420                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
1421                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1422
1423         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1424
1425         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
1426         DescriptorSetUpdateBuilder()
1427                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
1428                 .update(vk, device);
1429
1430         // Perform the computation
1431
1432         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1433         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1434         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1435
1436         const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1437
1438         const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1439
1440         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1441         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1442
1443         // Start recording commands
1444
1445         beginCommandBuffer(vk, *cmdBuffer);
1446
1447         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1448         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1449
1450         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1451         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1452         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1453
1454         endCommandBuffer(vk, *cmdBuffer);
1455
1456         // Wait for completion
1457
1458         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1459
1460         // Validate the results
1461
1462         const Allocation& bufferAllocation = buffer.getAllocation();
1463         invalidateAlloc(vk, device, bufferAllocation);
1464
1465         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
1466
1467         for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1468         {
1469                 const deUint32 res = bufferPtr[ndx];
1470                 const deUint32 ref = ~inputData[ndx];
1471
1472                 if (res != ref)
1473                 {
1474                         std::ostringstream msg;
1475                         msg << "Comparison failed for InOut.values[" << ndx << "]";
1476                         return tcu::TestStatus::fail(msg.str());
1477                 }
1478         }
1479         return tcu::TestStatus::pass("Compute succeeded");
1480 }
1481
1482 class WriteToMultipleSSBOTest : public vkt::TestCase
1483 {
1484 public:
1485                                                 WriteToMultipleSSBOTest (tcu::TestContext&      testCtx,
1486                                                                                                  const std::string&     name,
1487                                                                                                  const std::string&     description,
1488                                                                                                  const deUint32         numValues,
1489                                                                                                  const bool                     sized,
1490                                                                                                  const tcu::IVec3&      localSize,
1491                                                                                                  const tcu::IVec3&      workSize);
1492
1493         void                            initPrograms                    (SourceCollections& sourceCollections) const;
1494         TestInstance*           createInstance                  (Context&                       context) const;
1495
1496 private:
1497         const deUint32          m_numValues;
1498         const bool                      m_sized;
1499         const tcu::IVec3        m_localSize;
1500         const tcu::IVec3        m_workSize;
1501 };
1502
1503 class WriteToMultipleSSBOTestInstance : public vkt::TestInstance
1504 {
1505 public:
1506                                                                         WriteToMultipleSSBOTestInstance (Context&                       context,
1507                                                                                                                                          const deUint32         numValues,
1508                                                                                                                                          const tcu::IVec3&      localSize,
1509                                                                                                                                          const tcu::IVec3&      workSize);
1510
1511         tcu::TestStatus                                 iterate                                                 (void);
1512
1513 private:
1514         const deUint32                                  m_numValues;
1515         const tcu::IVec3                                m_localSize;
1516         const tcu::IVec3                                m_workSize;
1517 };
1518
1519 WriteToMultipleSSBOTest::WriteToMultipleSSBOTest (tcu::TestContext&             testCtx,
1520                                                                                                   const std::string&    name,
1521                                                                                                   const std::string&    description,
1522                                                                                                   const deUint32                numValues,
1523                                                                                                   const bool                    sized,
1524                                                                                                   const tcu::IVec3&             localSize,
1525                                                                                                   const tcu::IVec3&             workSize)
1526         : TestCase              (testCtx, name, description)
1527         , m_numValues   (numValues)
1528         , m_sized               (sized)
1529         , m_localSize   (localSize)
1530         , m_workSize    (workSize)
1531 {
1532         DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1533 }
1534
1535 void WriteToMultipleSSBOTest::initPrograms (SourceCollections& sourceCollections) const
1536 {
1537         std::ostringstream src;
1538         src << "#version 310 es\n"
1539                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1540                 << "layout(binding = 0) writeonly buffer Out0 {\n"
1541                 << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1542                 << "} sb_out0;\n"
1543                 << "layout(binding = 1) writeonly buffer Out1 {\n"
1544                 << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1545                 << "} sb_out1;\n"
1546                 << "void main (void) {\n"
1547                 << "    uvec3 size      = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1548                 << "    uint groupNdx   = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1549                 << "\n"
1550                 << "    {\n"
1551                 << "        uint numValuesPerInv = uint(sb_out0.values.length()) / (size.x*size.y*size.z);\n"
1552                 << "        uint offset          = numValuesPerInv*groupNdx;\n"
1553                 << "\n"
1554                 << "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1555                 << "            sb_out0.values[offset + ndx] = offset + ndx;\n"
1556                 << "    }\n"
1557                 << "    {\n"
1558                 << "        uint numValuesPerInv = uint(sb_out1.values.length()) / (size.x*size.y*size.z);\n"
1559                 << "        uint offset          = numValuesPerInv*groupNdx;\n"
1560                 << "\n"
1561                 << "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1562                 << "            sb_out1.values[offset + ndx] = uint(sb_out1.values.length()) - offset - ndx;\n"
1563                 << "    }\n"
1564                 << "}\n";
1565
1566         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1567 }
1568
1569 TestInstance* WriteToMultipleSSBOTest::createInstance (Context& context) const
1570 {
1571         return new WriteToMultipleSSBOTestInstance(context, m_numValues, m_localSize, m_workSize);
1572 }
1573
1574 WriteToMultipleSSBOTestInstance::WriteToMultipleSSBOTestInstance (Context&                      context,
1575                                                                                                                                   const deUint32        numValues,
1576                                                                                                                                   const tcu::IVec3&     localSize,
1577                                                                                                                                   const tcu::IVec3&     workSize)
1578         : TestInstance  (context)
1579         , m_numValues   (numValues)
1580         , m_localSize   (localSize)
1581         , m_workSize    (workSize)
1582 {
1583 }
1584
1585 tcu::TestStatus WriteToMultipleSSBOTestInstance::iterate (void)
1586 {
1587         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
1588         const VkDevice                  device                          = m_context.getDevice();
1589         const VkQueue                   queue                           = m_context.getUniversalQueue();
1590         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
1591         Allocator&                              allocator                       = m_context.getDefaultAllocator();
1592
1593         // Create two output buffers
1594
1595         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
1596         const Buffer buffer0(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1597         const Buffer buffer1(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1598
1599         // Create descriptor set
1600
1601         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1602                 DescriptorSetLayoutBuilder()
1603                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1604                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1605                 .build(vk, device));
1606
1607         const Unique<VkDescriptorPool> descriptorPool(
1608                 DescriptorPoolBuilder()
1609                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
1610                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1611
1612         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1613
1614         const VkDescriptorBufferInfo buffer0DescriptorInfo = makeDescriptorBufferInfo(*buffer0, 0ull, bufferSizeBytes);
1615         const VkDescriptorBufferInfo buffer1DescriptorInfo = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
1616         DescriptorSetUpdateBuilder()
1617                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer0DescriptorInfo)
1618                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer1DescriptorInfo)
1619                 .update(vk, device);
1620
1621         // Perform the computation
1622
1623         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1624         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1625         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1626
1627         const VkBufferMemoryBarrier shaderWriteBarriers[] =
1628         {
1629                 makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer0, 0ull, bufferSizeBytes),
1630                 makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes)
1631         };
1632
1633         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1634         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1635
1636         // Start recording commands
1637
1638         beginCommandBuffer(vk, *cmdBuffer);
1639
1640         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1641         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1642
1643         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1644         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, DE_LENGTH_OF_ARRAY(shaderWriteBarriers), shaderWriteBarriers, 0, (const VkImageMemoryBarrier*)DE_NULL);
1645
1646         endCommandBuffer(vk, *cmdBuffer);
1647
1648         // Wait for completion
1649
1650         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1651
1652         // Validate the results
1653         {
1654                 const Allocation& buffer0Allocation = buffer0.getAllocation();
1655                 invalidateAlloc(vk, device, buffer0Allocation);
1656                 const deUint32* buffer0Ptr = static_cast<deUint32*>(buffer0Allocation.getHostPtr());
1657
1658                 for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1659                 {
1660                         const deUint32 res = buffer0Ptr[ndx];
1661                         const deUint32 ref = ndx;
1662
1663                         if (res != ref)
1664                         {
1665                                 std::ostringstream msg;
1666                                 msg << "Comparison failed for Out0.values[" << ndx << "] res=" << res << " ref=" << ref;
1667                                 return tcu::TestStatus::fail(msg.str());
1668                         }
1669                 }
1670         }
1671         {
1672                 const Allocation& buffer1Allocation = buffer1.getAllocation();
1673                 invalidateAlloc(vk, device, buffer1Allocation);
1674                 const deUint32* buffer1Ptr = static_cast<deUint32*>(buffer1Allocation.getHostPtr());
1675
1676                 for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1677                 {
1678                         const deUint32 res = buffer1Ptr[ndx];
1679                         const deUint32 ref = m_numValues - ndx;
1680
1681                         if (res != ref)
1682                         {
1683                                 std::ostringstream msg;
1684                                 msg << "Comparison failed for Out1.values[" << ndx << "] res=" << res << " ref=" << ref;
1685                                 return tcu::TestStatus::fail(msg.str());
1686                         }
1687                 }
1688         }
1689         return tcu::TestStatus::pass("Compute succeeded");
1690 }
1691
1692 class SSBOBarrierTest : public vkt::TestCase
1693 {
1694 public:
1695                                                 SSBOBarrierTest         (tcu::TestContext&      testCtx,
1696                                                                                          const std::string&     name,
1697                                                                                          const std::string&     description,
1698                                                                                          const tcu::IVec3&      workSize);
1699
1700         void                            initPrograms            (SourceCollections& sourceCollections) const;
1701         TestInstance*           createInstance          (Context&                       context) const;
1702
1703 private:
1704         const tcu::IVec3        m_workSize;
1705 };
1706
1707 class SSBOBarrierTestInstance : public vkt::TestInstance
1708 {
1709 public:
1710                                                                         SSBOBarrierTestInstance         (Context&                       context,
1711                                                                                                                                  const tcu::IVec3&      workSize);
1712
1713         tcu::TestStatus                                 iterate                                         (void);
1714
1715 private:
1716         const tcu::IVec3                                m_workSize;
1717 };
1718
1719 SSBOBarrierTest::SSBOBarrierTest (tcu::TestContext&             testCtx,
1720                                                                   const std::string&    name,
1721                                                                   const std::string&    description,
1722                                                                   const tcu::IVec3&             workSize)
1723         : TestCase              (testCtx, name, description)
1724         , m_workSize    (workSize)
1725 {
1726 }
1727
1728 void SSBOBarrierTest::initPrograms (SourceCollections& sourceCollections) const
1729 {
1730         sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
1731                 "#version 310 es\n"
1732                 "layout (local_size_x = 1) in;\n"
1733                 "layout(binding = 2) readonly uniform Constants {\n"
1734                 "    uint u_baseVal;\n"
1735                 "};\n"
1736                 "layout(binding = 1) writeonly buffer Output {\n"
1737                 "    uint values[];\n"
1738                 "};\n"
1739                 "void main (void) {\n"
1740                 "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1741                 "    values[offset] = u_baseVal + offset;\n"
1742                 "}\n");
1743
1744         sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
1745                 "#version 310 es\n"
1746                 "layout (local_size_x = 1) in;\n"
1747                 "layout(binding = 1) readonly buffer Input {\n"
1748                 "    uint values[];\n"
1749                 "};\n"
1750                 "layout(binding = 0) coherent buffer Output {\n"
1751                 "    uint sum;\n"
1752                 "};\n"
1753                 "void main (void) {\n"
1754                 "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1755                 "    uint value  = values[offset];\n"
1756                 "    atomicAdd(sum, value);\n"
1757                 "}\n");
1758 }
1759
1760 TestInstance* SSBOBarrierTest::createInstance (Context& context) const
1761 {
1762         return new SSBOBarrierTestInstance(context, m_workSize);
1763 }
1764
1765 SSBOBarrierTestInstance::SSBOBarrierTestInstance (Context& context, const tcu::IVec3& workSize)
1766         : TestInstance  (context)
1767         , m_workSize    (workSize)
1768 {
1769 }
1770
1771 tcu::TestStatus SSBOBarrierTestInstance::iterate (void)
1772 {
1773         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
1774         const VkDevice                  device                          = m_context.getDevice();
1775         const VkQueue                   queue                           = m_context.getUniversalQueue();
1776         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
1777         Allocator&                              allocator                       = m_context.getDefaultAllocator();
1778
1779         // Create a work buffer used by both shaders
1780
1781         const int workGroupCount = multiplyComponents(m_workSize);
1782         const VkDeviceSize workBufferSizeBytes = sizeof(deUint32) * workGroupCount;
1783         const Buffer workBuffer(vk, device, allocator, makeBufferCreateInfo(workBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::Any);
1784
1785         // Create an output buffer
1786
1787         const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
1788         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1789
1790         // Initialize atomic counter value to zero
1791         {
1792                 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1793                 deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1794                 *outputBufferPtr = 0;
1795                 flushAlloc(vk, device, outputBufferAllocation);
1796         }
1797
1798         // Create a uniform buffer (to pass uniform constants)
1799
1800         const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
1801         const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
1802
1803         // Set the constants in the uniform buffer
1804
1805         const deUint32  baseValue = 127;
1806         {
1807                 const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
1808                 deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
1809                 uniformBufferPtr[0] = baseValue;
1810
1811                 flushAlloc(vk, device, uniformBufferAllocation);
1812         }
1813
1814         // Create descriptor set
1815
1816         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1817                 DescriptorSetLayoutBuilder()
1818                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1819                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1820                 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1821                 .build(vk, device));
1822
1823         const Unique<VkDescriptorPool> descriptorPool(
1824                 DescriptorPoolBuilder()
1825                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
1826                 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
1827                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1828
1829         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1830
1831         const VkDescriptorBufferInfo workBufferDescriptorInfo = makeDescriptorBufferInfo(*workBuffer, 0ull, workBufferSizeBytes);
1832         const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
1833         const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
1834         DescriptorSetUpdateBuilder()
1835                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
1836                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &workBufferDescriptorInfo)
1837                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
1838                 .update(vk, device);
1839
1840         // Perform the computation
1841
1842         const Unique<VkShaderModule> shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
1843         const Unique<VkShaderModule> shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));
1844
1845         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1846         const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
1847         const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));
1848
1849         const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
1850
1851         const VkBufferMemoryBarrier betweenShadersBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *workBuffer, 0ull, workBufferSizeBytes);
1852
1853         const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
1854
1855         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1856         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1857
1858         // Start recording commands
1859
1860         beginCommandBuffer(vk, *cmdBuffer);
1861
1862         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
1863         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1864
1865         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1866
1867         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1868         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &betweenShadersBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1869
1870         // Switch to the second shader program
1871         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
1872
1873         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1874         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1875
1876         endCommandBuffer(vk, *cmdBuffer);
1877
1878         // Wait for completion
1879
1880         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1881
1882         // Validate the results
1883
1884         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1885         invalidateAlloc(vk, device, outputBufferAllocation);
1886
1887         const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1888         const deUint32  res = *bufferPtr;
1889         deUint32                ref = 0;
1890
1891         for (int ndx = 0; ndx < workGroupCount; ++ndx)
1892                 ref += baseValue + ndx;
1893
1894         if (res != ref)
1895         {
1896                 std::ostringstream msg;
1897                 msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
1898                 return tcu::TestStatus::fail(msg.str());
1899         }
1900         return tcu::TestStatus::pass("Compute succeeded");
1901 }
1902
1903 class ImageAtomicOpTest : public vkt::TestCase
1904 {
1905 public:
1906                                                 ImageAtomicOpTest               (tcu::TestContext&      testCtx,
1907                                                                                                  const std::string& name,
1908                                                                                                  const std::string& description,
1909                                                                                                  const deUint32         localSize,
1910                                                                                                  const tcu::IVec2&      imageSize);
1911
1912         void                            initPrograms                    (SourceCollections& sourceCollections) const;
1913         TestInstance*           createInstance                  (Context&                       context) const;
1914
1915 private:
1916         const deUint32          m_localSize;
1917         const tcu::IVec2        m_imageSize;
1918 };
1919
1920 class ImageAtomicOpTestInstance : public vkt::TestInstance
1921 {
1922 public:
1923                                                                         ImageAtomicOpTestInstance               (Context&                       context,
1924                                                                                                                                          const deUint32         localSize,
1925                                                                                                                                          const tcu::IVec2&      imageSize);
1926
1927         tcu::TestStatus                                 iterate                                                 (void);
1928
1929 private:
1930         const deUint32                                  m_localSize;
1931         const tcu::IVec2                                m_imageSize;
1932 };
1933
1934 ImageAtomicOpTest::ImageAtomicOpTest (tcu::TestContext&         testCtx,
1935                                                                           const std::string&    name,
1936                                                                           const std::string&    description,
1937                                                                           const deUint32                localSize,
1938                                                                           const tcu::IVec2&             imageSize)
1939         : TestCase              (testCtx, name, description)
1940         , m_localSize   (localSize)
1941         , m_imageSize   (imageSize)
1942 {
1943 }
1944
1945 void ImageAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
1946 {
1947         std::ostringstream src;
1948         src << "#version 310 es\n"
1949                 << "#extension GL_OES_shader_image_atomic : require\n"
1950                 << "layout (local_size_x = " << m_localSize << ") in;\n"
1951                 << "layout(binding = 1, r32ui) coherent uniform highp uimage2D u_dstImg;\n"
1952                 << "layout(binding = 0) readonly buffer Input {\n"
1953                 << "    uint values[" << (multiplyComponents(m_imageSize) * m_localSize) << "];\n"
1954                 << "} sb_in;\n\n"
1955                 << "void main (void) {\n"
1956                 << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
1957                 << "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
1958                 << "\n"
1959                 << "    if (gl_LocalInvocationIndex == 0u)\n"
1960                 << "        imageStore(u_dstImg, ivec2(gl_WorkGroupID.xy), uvec4(0));\n"
1961                 << "    memoryBarrierImage();\n"
1962                 << "    barrier();\n"
1963                 << "    imageAtomicAdd(u_dstImg, ivec2(gl_WorkGroupID.xy), value);\n"
1964                 << "}\n";
1965
1966         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1967 }
1968
1969 TestInstance* ImageAtomicOpTest::createInstance (Context& context) const
1970 {
1971         return new ImageAtomicOpTestInstance(context, m_localSize, m_imageSize);
1972 }
1973
1974 ImageAtomicOpTestInstance::ImageAtomicOpTestInstance (Context& context, const deUint32 localSize, const tcu::IVec2& imageSize)
1975         : TestInstance  (context)
1976         , m_localSize   (localSize)
1977         , m_imageSize   (imageSize)
1978 {
1979 }
1980
1981 tcu::TestStatus ImageAtomicOpTestInstance::iterate (void)
1982 {
1983         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
1984         const VkDevice                  device                          = m_context.getDevice();
1985         const VkQueue                   queue                           = m_context.getUniversalQueue();
1986         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
1987         Allocator&                              allocator                       = m_context.getDefaultAllocator();
1988
1989         // Create an image
1990
1991         const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
1992         const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
1993
1994         const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
1995         const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
1996
1997         // Input buffer
1998
1999         const deUint32 numInputValues = multiplyComponents(m_imageSize) * m_localSize;
2000         const VkDeviceSize inputBufferSizeBytes = sizeof(deUint32) * numInputValues;
2001
2002         const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(inputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2003
2004         // Populate the input buffer with test data
2005         {
2006                 de::Random rnd(0x77238ac2);
2007                 const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
2008                 deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
2009                 for (deUint32 i = 0; i < numInputValues; ++i)
2010                         *bufferPtr++ = rnd.getUint32();
2011
2012                 flushAlloc(vk, device, inputBufferAllocation);
2013         }
2014
2015         // Create a buffer to store shader output (copied from image data)
2016
2017         const deUint32 imageArea = multiplyComponents(m_imageSize);
2018         const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32) * imageArea;
2019         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
2020
2021         // Create descriptor set
2022
2023         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2024                 DescriptorSetLayoutBuilder()
2025                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2026                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2027                 .build(vk, device));
2028
2029         const Unique<VkDescriptorPool> descriptorPool(
2030                 DescriptorPoolBuilder()
2031                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2032                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2033                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2034
2035         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2036
2037         // Set the bindings
2038
2039         const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2040         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, inputBufferSizeBytes);
2041
2042         DescriptorSetUpdateBuilder()
2043                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2044                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2045                 .update(vk, device);
2046
2047         // Perform the computation
2048         {
2049                 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2050                 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2051                 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
2052
2053                 const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, inputBufferSizeBytes);
2054
2055                 const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
2056                         (VkAccessFlags)0, VK_ACCESS_SHADER_WRITE_BIT,
2057                         VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
2058                         *image, subresourceRange);
2059
2060                 // Prepare the command buffer
2061
2062                 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2063                 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2064
2065                 // Start recording commands
2066
2067                 beginCommandBuffer(vk, *cmdBuffer);
2068
2069                 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2070                 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2071
2072                 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
2073                 vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2074
2075                 copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);
2076
2077                 endCommandBuffer(vk, *cmdBuffer);
2078
2079                 // Wait for completion
2080
2081                 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2082         }
2083
2084         // Validate the results
2085
2086         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2087         invalidateAlloc(vk, device, outputBufferAllocation);
2088
2089         const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2090         const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());
2091
2092         for (deUint32 pixelNdx = 0; pixelNdx < imageArea; ++pixelNdx)
2093         {
2094                 const deUint32  res = bufferPtr[pixelNdx];
2095                 deUint32                ref = 0;
2096
2097                 for (deUint32 offs = 0; offs < m_localSize; ++offs)
2098                         ref += refBufferPtr[pixelNdx * m_localSize + offs];
2099
2100                 if (res != ref)
2101                 {
2102                         std::ostringstream msg;
2103                         msg << "Comparison failed for pixel " << pixelNdx;
2104                         return tcu::TestStatus::fail(msg.str());
2105                 }
2106         }
2107         return tcu::TestStatus::pass("Compute succeeded");
2108 }
2109
2110 class ImageBarrierTest : public vkt::TestCase
2111 {
2112 public:
2113                                                 ImageBarrierTest        (tcu::TestContext&      testCtx,
2114                                                                                         const std::string&      name,
2115                                                                                         const std::string&      description,
2116                                                                                         const tcu::IVec2&       imageSize);
2117
2118         void                            initPrograms            (SourceCollections& sourceCollections) const;
2119         TestInstance*           createInstance          (Context&                       context) const;
2120
2121 private:
2122         const tcu::IVec2        m_imageSize;
2123 };
2124
2125 class ImageBarrierTestInstance : public vkt::TestInstance
2126 {
2127 public:
2128                                                                         ImageBarrierTestInstance        (Context&                       context,
2129                                                                                                                                  const tcu::IVec2&      imageSize);
2130
2131         tcu::TestStatus                                 iterate                                         (void);
2132
2133 private:
2134         const tcu::IVec2                                m_imageSize;
2135 };
2136
2137 ImageBarrierTest::ImageBarrierTest (tcu::TestContext&   testCtx,
2138                                                                         const std::string&      name,
2139                                                                         const std::string&      description,
2140                                                                         const tcu::IVec2&       imageSize)
2141         : TestCase              (testCtx, name, description)
2142         , m_imageSize   (imageSize)
2143 {
2144 }
2145
2146 void ImageBarrierTest::initPrograms (SourceCollections& sourceCollections) const
2147 {
2148         sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
2149                 "#version 310 es\n"
2150                 "layout (local_size_x = 1) in;\n"
2151                 "layout(binding = 2) readonly uniform Constants {\n"
2152                 "    uint u_baseVal;\n"
2153                 "};\n"
2154                 "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_img;\n"
2155                 "void main (void) {\n"
2156                 "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
2157                 "    imageStore(u_img, ivec2(gl_WorkGroupID.xy), uvec4(offset + u_baseVal, 0, 0, 0));\n"
2158                 "}\n");
2159
2160         sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
2161                 "#version 310 es\n"
2162                 "layout (local_size_x = 1) in;\n"
2163                 "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_img;\n"
2164                 "layout(binding = 0) coherent buffer Output {\n"
2165                 "    uint sum;\n"
2166                 "};\n"
2167                 "void main (void) {\n"
2168                 "    uint value = imageLoad(u_img, ivec2(gl_WorkGroupID.xy)).x;\n"
2169                 "    atomicAdd(sum, value);\n"
2170                 "}\n");
2171 }
2172
2173 TestInstance* ImageBarrierTest::createInstance (Context& context) const
2174 {
2175         return new ImageBarrierTestInstance(context, m_imageSize);
2176 }
2177
2178 ImageBarrierTestInstance::ImageBarrierTestInstance (Context& context, const tcu::IVec2& imageSize)
2179         : TestInstance  (context)
2180         , m_imageSize   (imageSize)
2181 {
2182 }
2183
2184 tcu::TestStatus ImageBarrierTestInstance::iterate (void)
2185 {
2186         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
2187         const VkDevice                  device                          = m_context.getDevice();
2188         const VkQueue                   queue                           = m_context.getUniversalQueue();
2189         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
2190         Allocator&                              allocator                       = m_context.getDefaultAllocator();
2191
2192         // Create an image used by both shaders
2193
2194         const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_STORAGE_BIT);
2195         const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
2196
2197         const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
2198         const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
2199
2200         // Create an output buffer
2201
2202         const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
2203         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2204
2205         // Initialize atomic counter value to zero
2206         {
2207                 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2208                 deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2209                 *outputBufferPtr = 0;
2210                 flushAlloc(vk, device, outputBufferAllocation);
2211         }
2212
2213         // Create a uniform buffer (to pass uniform constants)
2214
2215         const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
2216         const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2217
2218         // Set the constants in the uniform buffer
2219
2220         const deUint32  baseValue = 127;
2221         {
2222                 const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
2223                 deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
2224                 uniformBufferPtr[0] = baseValue;
2225
2226                 flushAlloc(vk, device, uniformBufferAllocation);
2227         }
2228
2229         // Create descriptor set
2230
2231         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2232                 DescriptorSetLayoutBuilder()
2233                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2234                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2235                 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2236                 .build(vk, device));
2237
2238         const Unique<VkDescriptorPool> descriptorPool(
2239                 DescriptorPoolBuilder()
2240                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2241                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2242                 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2243                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2244
2245         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2246
2247         const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2248         const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
2249         const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2250         DescriptorSetUpdateBuilder()
2251                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
2252                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2253                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2254                 .update(vk, device);
2255
2256         // Perform the computation
2257
2258         const Unique<VkShaderModule>    shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
2259         const Unique<VkShaderModule>    shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));
2260
2261         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2262         const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
2263         const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));
2264
2265         const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2266
2267         const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
2268                 0u, 0u,
2269                 VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
2270                 *image, subresourceRange);
2271
2272         const VkImageMemoryBarrier imageBarrierBetweenShaders = makeImageMemoryBarrier(
2273                 VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
2274                 VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
2275                 *image, subresourceRange);
2276
2277         const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
2278
2279         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2280         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2281
2282         // Start recording commands
2283
2284         beginCommandBuffer(vk, *cmdBuffer);
2285
2286         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
2287         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2288
2289         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 1, &imageLayoutBarrier);
2290
2291         vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2292         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 0, (const VkBufferMemoryBarrier*)DE_NULL, 1, &imageBarrierBetweenShaders);
2293
2294         // Switch to the second shader program
2295         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
2296
2297         vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2298         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2299
2300         endCommandBuffer(vk, *cmdBuffer);
2301
2302         // Wait for completion
2303
2304         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2305
2306         // Validate the results
2307
2308         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2309         invalidateAlloc(vk, device, outputBufferAllocation);
2310
2311         const int               numValues = multiplyComponents(m_imageSize);
2312         const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2313         const deUint32  res = *bufferPtr;
2314         deUint32                ref = 0;
2315
2316         for (int ndx = 0; ndx < numValues; ++ndx)
2317                 ref += baseValue + ndx;
2318
2319         if (res != ref)
2320         {
2321                 std::ostringstream msg;
2322                 msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
2323                 return tcu::TestStatus::fail(msg.str());
2324         }
2325         return tcu::TestStatus::pass("Compute succeeded");
2326 }
2327
2328 class ComputeTestInstance : public vkt::TestInstance
2329 {
2330 public:
2331                 ComputeTestInstance             (Context& context)
2332                 : TestInstance                  (context)
2333                 , m_numPhysDevices              (1)
2334                 , m_queueFamilyIndex    (0)
2335         {
2336                 createDeviceGroup();
2337         }
2338
2339         void                                                    createDeviceGroup       (void);
2340         const vk::DeviceInterface&              getDeviceInterface      (void)                  { return *m_deviceDriver; }
2341         vk::VkInstance                                  getInstance                     (void)                  { return m_deviceGroupInstance; }
2342         vk::VkDevice                                    getDevice                       (void)                  { return *m_logicalDevice; }
2343         vk::VkPhysicalDevice                    getPhysicalDevice       (deUint32 i = 0){ return m_physicalDevices[i]; }
2344
2345 protected:
2346         deUint32                                                m_numPhysDevices;
2347         deUint32                                                m_queueFamilyIndex;
2348
2349 private:
2350         CustomInstance                                          m_deviceGroupInstance;
2351         vk::Move<vk::VkDevice>                          m_logicalDevice;
2352         std::vector<vk::VkPhysicalDevice>       m_physicalDevices;
2353         de::MovePtr<vk::DeviceDriver>           m_deviceDriver;
2354 };
2355
2356 void ComputeTestInstance::createDeviceGroup (void)
2357 {
2358         const tcu::CommandLine&                                                 cmdLine                                 = m_context.getTestContext().getCommandLine();
2359         const deUint32                                                                  devGroupIdx                             = cmdLine.getVKDeviceGroupId() - 1;
2360         const deUint32                                                                  physDeviceIdx                   = cmdLine.getVKDeviceId() - 1;
2361         const float                                                                             queuePriority                   = 1.0f;
2362         const std::vector<std::string>                                  requiredExtensions              (1, "VK_KHR_device_group_creation");
2363         m_deviceGroupInstance                                                                                                   = createCustomInstanceWithExtensions(m_context, requiredExtensions);
2364         std::vector<VkPhysicalDeviceGroupProperties>    devGroupProperties              = enumeratePhysicalDeviceGroups(m_context.getInstanceInterface(), m_deviceGroupInstance);
2365         m_numPhysDevices                                                                                                                = devGroupProperties[devGroupIdx].physicalDeviceCount;
2366         std::vector<const char*>                                                deviceExtensions;
2367
2368         if (!isCoreDeviceExtension(m_context.getUsedApiVersion(), "VK_KHR_device_group"))
2369                 deviceExtensions.push_back("VK_KHR_device_group");
2370
2371         VkDeviceGroupDeviceCreateInfo                                   deviceGroupInfo                 =
2372         {
2373                 VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO_KHR,                                                          //stype
2374                 DE_NULL,                                                                                                                                                        //pNext
2375                 devGroupProperties[devGroupIdx].physicalDeviceCount,                                                            //physicalDeviceCount
2376                 devGroupProperties[devGroupIdx].physicalDevices                                                                         //physicalDevices
2377         };
2378         const InstanceDriver&                                                   instance                                (m_deviceGroupInstance.getDriver());
2379         const VkPhysicalDeviceFeatures                                  deviceFeatures                  = getPhysicalDeviceFeatures(instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx]);
2380         const std::vector<VkQueueFamilyProperties>              queueProps                              = getPhysicalDeviceQueueFamilyProperties(instance, devGroupProperties[devGroupIdx].physicalDevices[physDeviceIdx]);
2381
2382         m_physicalDevices.resize(m_numPhysDevices);
2383         for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2384                 m_physicalDevices[physDevIdx] = devGroupProperties[devGroupIdx].physicalDevices[physDevIdx];
2385
2386         for (size_t queueNdx = 0; queueNdx < queueProps.size(); queueNdx++)
2387         {
2388                 if (queueProps[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
2389                         m_queueFamilyIndex = (deUint32)queueNdx;
2390         }
2391
2392         VkDeviceQueueCreateInfo                                                 queueInfo                               =
2393         {
2394                 VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,             // VkStructureType                                      sType;
2395                 DE_NULL,                                                                                // const void*                                          pNext;
2396                 (VkDeviceQueueCreateFlags)0u,                                   // VkDeviceQueueCreateFlags                     flags;
2397                 m_queueFamilyIndex,                                                             // deUint32                                                     queueFamilyIndex;
2398                 1u,                                                                                             // deUint32                                                     queueCount;
2399                 &queuePriority                                                                  // const float*                                         pQueuePriorities;
2400         };
2401
2402         const VkDeviceCreateInfo                                                deviceInfo                              =
2403         {
2404                 VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,                                                   // VkStructureType                                      sType;
2405                 &deviceGroupInfo,                                                                                               // const void*                                          pNext;
2406                 (VkDeviceCreateFlags)0,                                                                                 // VkDeviceCreateFlags                          flags;
2407                 1u      ,                                                                                                                       // uint32_t                                                     queueCreateInfoCount;
2408                 &queueInfo,                                                                                                             // const VkDeviceQueueCreateInfo*       pQueueCreateInfos;
2409                 0u,                                                                                                                             // uint32_t                                                     enabledLayerCount;
2410                 DE_NULL,                                                                                                                // const char* const*                           ppEnabledLayerNames;
2411                 deUint32(deviceExtensions.size()),                                                              // uint32_t                                                     enabledExtensionCount;
2412                 (deviceExtensions.empty() ? DE_NULL : &deviceExtensions[0]),    // const char* const*                           ppEnabledExtensionNames;
2413                 &deviceFeatures,                                                                                                // const VkPhysicalDeviceFeatures*      pEnabledFeatures;
2414         };
2415
2416         m_logicalDevice         = createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), m_deviceGroupInstance, instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx], &deviceInfo);
2417         m_deviceDriver          = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_deviceGroupInstance, *m_logicalDevice));
2418 }
2419
2420 class DispatchBaseTest : public vkt::TestCase
2421 {
2422 public:
2423                                                 DispatchBaseTest        (tcu::TestContext&      testCtx,
2424                                                                                         const std::string&      name,
2425                                                                                         const std::string&      description,
2426                                                                                         const deUint32          numValues,
2427                                                                                         const tcu::IVec3&       localsize,
2428                                                                                         const tcu::IVec3&       worksize,
2429                                                                                         const tcu::IVec3&       splitsize);
2430
2431         void                            initPrograms            (SourceCollections& sourceCollections) const;
2432         TestInstance*           createInstance          (Context&                       context) const;
2433
2434 private:
2435         const deUint32                                  m_numValues;
2436         const tcu::IVec3                                m_localSize;
2437         const tcu::IVec3                                m_workSize;
2438         const tcu::IVec3                                m_splitSize;
2439 };
2440
2441 class DispatchBaseTestInstance : public ComputeTestInstance
2442 {
2443 public:
2444                                                                         DispatchBaseTestInstance        (Context&                       context,
2445                                                                                                                                 const deUint32          numValues,
2446                                                                                                                                 const tcu::IVec3&       localsize,
2447                                                                                                                                 const tcu::IVec3&       worksize,
2448                                                                                                                                 const tcu::IVec3&       splitsize);
2449
2450         bool                                                    isInputVectorValid                      (const tcu::IVec3& small, const tcu::IVec3& big);
2451         tcu::TestStatus                                 iterate                                         (void);
2452
2453 private:
2454         const deUint32                                  m_numValues;
2455         const tcu::IVec3                                m_localSize;
2456         const tcu::IVec3                                m_workSize;
2457         const tcu::IVec3                                m_splitWorkSize;
2458 };
2459
2460 DispatchBaseTest::DispatchBaseTest (tcu::TestContext&   testCtx,
2461                                                                         const std::string&      name,
2462                                                                         const std::string&      description,
2463                                                                         const deUint32          numValues,
2464                                                                         const tcu::IVec3&       localsize,
2465                                                                         const tcu::IVec3&       worksize,
2466                                                                         const tcu::IVec3&       splitsize)
2467         : TestCase              (testCtx, name, description)
2468         , m_numValues   (numValues)
2469         , m_localSize   (localsize)
2470         , m_workSize    (worksize)
2471         , m_splitSize   (splitsize)
2472 {
2473 }
2474
2475 void DispatchBaseTest::initPrograms (SourceCollections& sourceCollections) const
2476 {
2477         std::ostringstream src;
2478         src << "#version 310 es\n"
2479                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2480
2481                 << "layout(binding = 0) buffer InOut {\n"
2482                 << "    uint values[" << de::toString(m_numValues) << "];\n"
2483                 << "} sb_inout;\n"
2484
2485                 << "layout(binding = 1) readonly uniform uniformInput {\n"
2486                 << "    uvec3 gridSize;\n"
2487                 << "} ubo_in;\n"
2488
2489                 << "void main (void) {\n"
2490                 << "    uvec3 size = ubo_in.gridSize * gl_WorkGroupSize;\n"
2491                 << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2492                 << "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2493                 << "    uint offset = numValuesPerInv*index;\n"
2494                 << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2495                 << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
2496                 << "}\n";
2497
2498         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2499 }
2500
2501 TestInstance* DispatchBaseTest::createInstance (Context& context) const
2502 {
2503         return new DispatchBaseTestInstance(context, m_numValues, m_localSize, m_workSize, m_splitSize);
2504 }
2505
2506 DispatchBaseTestInstance::DispatchBaseTestInstance (Context& context,
2507                                                                                                         const deUint32          numValues,
2508                                                                                                         const tcu::IVec3&       localsize,
2509                                                                                                         const tcu::IVec3&       worksize,
2510                                                                                                         const tcu::IVec3&       splitsize)
2511
2512         : ComputeTestInstance   (context)
2513         , m_numValues                   (numValues)
2514         , m_localSize                   (localsize)
2515         , m_workSize                    (worksize)
2516         , m_splitWorkSize               (splitsize)
2517 {
2518         // For easy work distribution across physical devices:
2519         // WorkSize should be a multiple of SplitWorkSize only in the X component
2520         if ((!isInputVectorValid(m_splitWorkSize, m_workSize)) ||
2521                 (m_workSize.x() <= m_splitWorkSize.x()) ||
2522                 (m_workSize.y() != m_splitWorkSize.y()) ||
2523                 (m_workSize.z() != m_splitWorkSize.z()))
2524                 TCU_THROW(TestError, "Invalid Input.");
2525
2526         // For easy work distribution within the same physical device:
2527         // SplitWorkSize should be a multiple of localSize in Y or Z component
2528         if ((!isInputVectorValid(m_localSize, m_splitWorkSize)) ||
2529                 (m_localSize.x() != m_splitWorkSize.x()) ||
2530                 (m_localSize.y() >= m_splitWorkSize.y()) ||
2531                 (m_localSize.z() >= m_splitWorkSize.z()))
2532                 TCU_THROW(TestError, "Invalid Input.");
2533
2534         if ((multiplyComponents(m_workSize) / multiplyComponents(m_splitWorkSize)) < (deInt32) m_numPhysDevices)
2535                 TCU_THROW(TestError, "Not enough work to distribute across all physical devices.");
2536
2537         deUint32 totalWork = multiplyComponents(m_workSize) * multiplyComponents(m_localSize);
2538         if ((totalWork > numValues) || (numValues % totalWork != 0))
2539                 TCU_THROW(TestError, "Buffer too small/not aligned to cover all values.");
2540 }
2541
2542 bool DispatchBaseTestInstance::isInputVectorValid(const tcu::IVec3& small, const tcu::IVec3& big)
2543 {
2544         if (((big.x() < small.x()) || (big.y() < small.y()) || (big.z() < small.z())) ||
2545                 ((big.x() % small.x() != 0) || (big.y() % small.y() != 0) || (big.z() % small.z() != 0)))
2546                 return false;
2547         return true;
2548 }
2549
2550 tcu::TestStatus DispatchBaseTestInstance::iterate (void)
2551 {
2552         const DeviceInterface&  vk                                      = getDeviceInterface();
2553         const VkDevice                  device                          = getDevice();
2554         const VkQueue                   queue                           = getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
2555         SimpleAllocator                 allocator                       (vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
2556         deUint32                                totalWorkloadSize       = 0;
2557
2558         // Create an uniform and input/output buffer
2559         const deUint32 uniformBufSize = 3; // Pass the compute grid size
2560         const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
2561         const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2562
2563         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
2564         const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2565
2566         // Fill the buffers with data
2567         typedef std::vector<deUint32> data_vector_t;
2568         data_vector_t uniformInputData(uniformBufSize);
2569         data_vector_t inputData(m_numValues);
2570
2571         {
2572                 const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2573                 deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2574                 uniformInputData[0] = *bufferPtr++ = m_workSize.x();
2575                 uniformInputData[1] = *bufferPtr++ = m_workSize.y();
2576                 uniformInputData[2] = *bufferPtr++ = m_workSize.z();
2577                 flushAlloc(vk, device, bufferAllocation);
2578         }
2579
2580         {
2581                 de::Random rnd(0x82ce7f);
2582                 const Allocation& bufferAllocation = buffer.getAllocation();
2583                 deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2584                 for (deUint32 i = 0; i < m_numValues; ++i)
2585                         inputData[i] = *bufferPtr++ = rnd.getUint32();
2586
2587                 flushAlloc(vk, device, bufferAllocation);
2588         }
2589
2590         // Create descriptor set
2591         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2592                 DescriptorSetLayoutBuilder()
2593                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2594                 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2595                 .build(vk, device));
2596
2597         const Unique<VkDescriptorPool> descriptorPool(
2598                 DescriptorPoolBuilder()
2599                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2600                 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2601                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2602
2603         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2604
2605         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
2606         const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2607
2608         DescriptorSetUpdateBuilder()
2609                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2610                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2611                 .update(vk, device);
2612
2613         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2614         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2615         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, static_cast<VkPipelineCreateFlags>(VK_PIPELINE_CREATE_DISPATCH_BASE), *shaderModule, static_cast<VkPipelineShaderStageCreateFlags>(0u)));
2616
2617         const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2618         const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2619
2620         const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2621
2622         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
2623         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2624
2625         // Start recording commands
2626         beginCommandBuffer(vk, *cmdBuffer);
2627
2628         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2629         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2630
2631         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2632
2633         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2634
2635         // Split the workload across all physical devices based on m_splitWorkSize.x()
2636         for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2637         {
2638                 deUint32 baseGroupX = physDevIdx * m_splitWorkSize.x();
2639                 deUint32 baseGroupY = 0;
2640                 deUint32 baseGroupZ = 0;
2641
2642                 // Split the workload within the physical device based on m_localSize.y() and m_localSize.z()
2643                 for (deInt32 localIdxY = 0; localIdxY < (m_splitWorkSize.y() / m_localSize.y()); localIdxY++)
2644                 {
2645                         for (deInt32 localIdxZ = 0; localIdxZ < (m_splitWorkSize.z() / m_localSize.z()); localIdxZ++)
2646                         {
2647                                 deUint32 offsetX = baseGroupX;
2648                                 deUint32 offsetY = baseGroupY + localIdxY * m_localSize.y();
2649                                 deUint32 offsetZ = baseGroupZ + localIdxZ * m_localSize.z();
2650
2651                                 deUint32 localSizeX = (physDevIdx == (m_numPhysDevices - 1)) ? m_workSize.x() - baseGroupX : m_localSize.x();
2652                                 deUint32 localSizeY = m_localSize.y();
2653                                 deUint32 localSizeZ = m_localSize.z();
2654
2655                                 totalWorkloadSize += (localSizeX * localSizeY * localSizeZ);
2656                                 vk.cmdDispatchBase(*cmdBuffer, offsetX, offsetY, offsetZ, localSizeX, localSizeY, localSizeZ);
2657                         }
2658                 }
2659         }
2660
2661         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2662
2663         endCommandBuffer(vk, *cmdBuffer);
2664         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2665
2666         if (totalWorkloadSize != deUint32(multiplyComponents(m_workSize)))
2667                 TCU_THROW(TestError, "Not covering the entire workload.");
2668
2669         // Validate the results
2670         const Allocation& bufferAllocation = buffer.getAllocation();
2671         invalidateAlloc(vk, device, bufferAllocation);
2672         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2673
2674         for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
2675         {
2676                 const deUint32 res = bufferPtr[ndx];
2677                 const deUint32 ref = ~inputData[ndx];
2678
2679                 if (res != ref)
2680                 {
2681                         std::ostringstream msg;
2682                         msg << "Comparison failed for InOut.values[" << ndx << "]";
2683                         return tcu::TestStatus::fail(msg.str());
2684                 }
2685         }
2686         return tcu::TestStatus::pass("Compute succeeded");
2687 }
2688
2689 class DeviceIndexTest : public vkt::TestCase
2690 {
2691 public:
2692         DeviceIndexTest         (tcu::TestContext&      testCtx,
2693                                                                                         const std::string&      name,
2694                                                                                         const std::string&      description,
2695                                                                                         const deUint32          numValues,
2696                                                                                         const tcu::IVec3&       localsize,
2697                                                                                         const tcu::IVec3&       splitsize);
2698
2699         void                            initPrograms            (SourceCollections& sourceCollections) const;
2700         TestInstance*           createInstance          (Context&                       context) const;
2701
2702 private:
2703         const deUint32                                  m_numValues;
2704         const tcu::IVec3                                m_localSize;
2705         const tcu::IVec3                                m_workSize;
2706         const tcu::IVec3                                m_splitSize;
2707 };
2708
2709 class DeviceIndexTestInstance : public ComputeTestInstance
2710 {
2711 public:
2712                                                                         DeviceIndexTestInstance (Context&                       context,
2713                                                                                                                                 const deUint32          numValues,
2714                                                                                                                                 const tcu::IVec3&       localsize,
2715                                                                                                                                 const tcu::IVec3&       worksize);
2716         tcu::TestStatus                                 iterate                                         (void);
2717 private:
2718         const deUint32                                  m_numValues;
2719         const tcu::IVec3                                m_localSize;
2720         tcu::IVec3                                              m_workSize;
2721 };
2722
2723 DeviceIndexTest::DeviceIndexTest (tcu::TestContext&     testCtx,
2724                                                                         const std::string&      name,
2725                                                                         const std::string&      description,
2726                                                                         const deUint32          numValues,
2727                                                                         const tcu::IVec3&       localsize,
2728                                                                         const tcu::IVec3&       worksize)
2729         : TestCase              (testCtx, name, description)
2730         , m_numValues   (numValues)
2731         , m_localSize   (localsize)
2732         , m_workSize    (worksize)
2733 {
2734 }
2735
2736 void DeviceIndexTest::initPrograms (SourceCollections& sourceCollections) const
2737 {
2738         std::ostringstream src;
2739         src << "#version 310 es\n"
2740                 << "#extension GL_EXT_device_group : require\n"
2741                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2742
2743                 << "layout(binding = 0) buffer InOut {\n"
2744                 << "    uint values[" << de::toString(m_numValues) << "];\n"
2745                 << "} sb_inout;\n"
2746
2747                 << "layout(binding = 1) readonly uniform uniformInput {\n"
2748                 << "    uint baseOffset[1+" << VK_MAX_DEVICE_GROUP_SIZE_KHR << "];\n"
2749                 << "} ubo_in;\n"
2750
2751                 << "void main (void) {\n"
2752                 << "    uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
2753                 << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2754                 << "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2755                 << "    uint offset = numValuesPerInv*index;\n"
2756                 << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2757                 << "        sb_inout.values[offset + ndx] = ubo_in.baseOffset[0] + ubo_in.baseOffset[gl_DeviceIndex + 1];\n"
2758                 << "}\n";
2759
2760         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2761 }
2762
2763 TestInstance* DeviceIndexTest::createInstance (Context& context) const
2764 {
2765         return new DeviceIndexTestInstance(context, m_numValues, m_localSize, m_workSize);
2766 }
2767
2768 DeviceIndexTestInstance::DeviceIndexTestInstance (Context& context,
2769                                                                                                         const deUint32          numValues,
2770                                                                                                         const tcu::IVec3&       localsize,
2771                                                                                                         const tcu::IVec3&       worksize)
2772
2773         : ComputeTestInstance   (context)
2774         , m_numValues                   (numValues)
2775         , m_localSize                   (localsize)
2776         , m_workSize                    (worksize)
2777 {}
2778
2779 tcu::TestStatus DeviceIndexTestInstance::iterate (void)
2780 {
2781         const DeviceInterface&                  vk                                      = getDeviceInterface();
2782         const VkDevice                                  device                          = getDevice();
2783         const VkQueue                                   queue                           = getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
2784         SimpleAllocator                                 allocator                       (vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
2785         const deUint32                                  allocDeviceMask         = (1 << m_numPhysDevices) - 1;
2786         de::Random                                              rnd                                     (0x82ce7f);
2787         Move<VkBuffer>                                  sboBuffer;
2788         vk::Move<vk::VkDeviceMemory>    sboBufferMemory;
2789
2790         // Create an uniform and output buffer
2791         const deUint32 uniformBufSize = 4 * (1 + VK_MAX_DEVICE_GROUP_SIZE_KHR);
2792         const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
2793         const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2794
2795         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
2796         const Buffer checkBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
2797
2798         // create SBO buffer
2799         {
2800                 const VkBufferCreateInfo        sboBufferParams =
2801                 {
2802                         VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,                                                                   // sType
2803                         DE_NULL,                                                                                                                                // pNext
2804                         0u,                                                                                                                                             // flags
2805                         (VkDeviceSize)bufferSizeBytes,                                                                                  // size
2806                         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,  // usage
2807                         VK_SHARING_MODE_EXCLUSIVE,                                                                                              // sharingMode
2808                         1u,                                                                                                                                             // queueFamilyIndexCount
2809                         &m_queueFamilyIndex,                                                                                                            // pQueueFamilyIndices
2810                 };
2811                 sboBuffer = createBuffer(vk, device, &sboBufferParams);
2812
2813                 VkMemoryRequirements memReqs = getBufferMemoryRequirements(vk, device, sboBuffer.get());
2814                 deUint32 memoryTypeNdx = 0;
2815                 const VkPhysicalDeviceMemoryProperties deviceMemProps = getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice());
2816                 for ( memoryTypeNdx = 0; memoryTypeNdx < deviceMemProps.memoryTypeCount; memoryTypeNdx++)
2817                 {
2818                         if ((memReqs.memoryTypeBits & (1u << memoryTypeNdx)) != 0 &&
2819                                 (deviceMemProps.memoryTypes[memoryTypeNdx].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
2820                                 break;
2821                 }
2822                 if (memoryTypeNdx == deviceMemProps.memoryTypeCount)
2823                         TCU_THROW(NotSupportedError, "No compatible memory type found");
2824
2825                 const VkMemoryAllocateFlagsInfo allocDeviceMaskInfo =
2826                 {
2827                         VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO_KHR,       // sType
2828                         DE_NULL,                                                                                        // pNext
2829                         VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT,                                     // flags
2830                         allocDeviceMask,                                                                        // deviceMask
2831                 };
2832
2833                 VkMemoryAllocateInfo            allocInfo =
2834                 {
2835                         VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,                 // sType
2836                         &allocDeviceMaskInfo,                                                   // pNext
2837                         memReqs.size,                                                                   // allocationSize
2838                         memoryTypeNdx,                                                                  // memoryTypeIndex
2839                 };
2840
2841                 sboBufferMemory = allocateMemory(vk, device, &allocInfo);
2842                 VK_CHECK(vk.bindBufferMemory(device, *sboBuffer, sboBufferMemory.get(), 0));
2843         }
2844
2845         // Fill the buffers with data
2846         typedef std::vector<deUint32> data_vector_t;
2847         data_vector_t uniformInputData(uniformBufSize, 0);
2848
2849         {
2850                 const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2851                 deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2852                 for (deUint32 i = 0; i < uniformBufSize; ++i)
2853                         uniformInputData[i] = *bufferPtr++ = rnd.getUint32() / 10; // divide to prevent overflow in addition
2854
2855                 flushAlloc(vk, device, bufferAllocation);
2856         }
2857
2858         // Create descriptor set
2859         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2860                 DescriptorSetLayoutBuilder()
2861                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2862                 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2863                 .build(vk, device));
2864
2865         const Unique<VkDescriptorPool> descriptorPool(
2866                 DescriptorPoolBuilder()
2867                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2868                 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2869                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2870
2871         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2872
2873         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*sboBuffer, 0ull, bufferSizeBytes);
2874         const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2875
2876         DescriptorSetUpdateBuilder()
2877                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2878                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2879                 .update(vk, device);
2880
2881         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2882         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2883         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
2884
2885         const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2886         const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
2887
2888         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
2889         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2890
2891         // Verify multiple device masks
2892         for (deUint32 physDevMask = 1; physDevMask < (1u << m_numPhysDevices); physDevMask++)
2893         {
2894                 deUint32 constantValPerLoop = 0;
2895                 {
2896                         const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2897                         deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2898                         constantValPerLoop = *bufferPtr = rnd.getUint32() / 10;  // divide to prevent overflow in addition
2899                         flushAlloc(vk, device, bufferAllocation);
2900                 }
2901                 beginCommandBuffer(vk, *cmdBuffer);
2902
2903                 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2904                 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2905                 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2906
2907                 vk.cmdSetDeviceMask(*cmdBuffer, physDevMask);
2908                 vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
2909
2910                 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2911
2912                 endCommandBuffer(vk, *cmdBuffer);
2913                 submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, physDevMask);
2914
2915                 // Validate the results on all physical devices where compute shader was launched
2916                 const VkBufferMemoryBarrier srcBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
2917                 const VkBufferMemoryBarrier dstBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *checkBuffer, 0ull, bufferSizeBytes);
2918                 const VkBufferCopy      copyParams =
2919                 {
2920                         (VkDeviceSize)0u,                                               // srcOffset
2921                         (VkDeviceSize)0u,                                               // dstOffset
2922                         bufferSizeBytes                                                 // size
2923                 };
2924
2925                 for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2926                 {
2927                         if (!(1<<physDevIdx & physDevMask))
2928                                 continue;
2929
2930                         const deUint32 deviceMask = 1 << physDevIdx;
2931
2932                         beginCommandBuffer(vk, *cmdBuffer);
2933                         vk.cmdSetDeviceMask(*cmdBuffer, deviceMask);
2934                         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT , VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &srcBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2935                         vk.cmdCopyBuffer(*cmdBuffer, *sboBuffer, *checkBuffer, 1, &copyParams);
2936                         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &dstBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2937
2938                         endCommandBuffer(vk, *cmdBuffer);
2939                         submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, deviceMask);
2940
2941                         const Allocation& bufferAllocation = checkBuffer.getAllocation();
2942                         invalidateAlloc(vk, device, bufferAllocation);
2943                         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2944
2945                         for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
2946                         {
2947                                 const deUint32 res = bufferPtr[ndx];
2948                                 const deUint32 ref = constantValPerLoop + uniformInputData[4 * (physDevIdx + 1)];
2949
2950                                 if (res != ref)
2951                                 {
2952                                         std::ostringstream msg;
2953                                         msg << "Comparison failed on physical device "<< getPhysicalDevice(physDevIdx) <<" ( deviceMask "<< deviceMask <<" ) for InOut.values[" << ndx << "]";
2954                                         return tcu::TestStatus::fail(msg.str());
2955                                 }
2956                         }
2957                 }
2958         }
2959
2960         return tcu::TestStatus::pass("Compute succeeded");
2961 }
2962
2963 class ConcurrentCompute : public vkt::TestCase
2964 {
2965 public:
2966                                                 ConcurrentCompute       (tcu::TestContext&      testCtx,
2967                                                                                          const std::string&     name,
2968                                                                                          const std::string&     description);
2969
2970
2971         void                            initPrograms            (SourceCollections& sourceCollections) const;
2972         TestInstance*           createInstance          (Context&                       context) const;
2973 };
2974
2975 class ConcurrentComputeInstance : public vkt::TestInstance
2976 {
2977 public:
2978                                                                         ConcurrentComputeInstance       (Context& context);
2979
2980         tcu::TestStatus                                 iterate                                         (void);
2981 };
2982
2983 ConcurrentCompute::ConcurrentCompute (tcu::TestContext& testCtx,
2984                                                                           const std::string&    name,
2985                                                                           const std::string&    description)
2986         : TestCase              (testCtx, name, description)
2987 {
2988 }
2989
2990 void ConcurrentCompute::initPrograms (SourceCollections& sourceCollections) const
2991 {
2992         std::ostringstream src;
2993         src << "#version 310 es\n"
2994                 << "layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
2995                 << "layout(binding = 0) buffer InOut {\n"
2996                 << "    uint values[1024];\n"
2997                 << "} sb_inout;\n"
2998                 << "void main (void) {\n"
2999                 << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3000                 << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3001                 << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
3002                 << "    uint offset          = numValuesPerInv*groupNdx;\n"
3003                 << "\n"
3004                 << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3005                 << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
3006                 << "}\n";
3007
3008         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
3009 }
3010
3011 TestInstance* ConcurrentCompute::createInstance (Context& context) const
3012 {
3013         return new ConcurrentComputeInstance(context);
3014 }
3015
3016 ConcurrentComputeInstance::ConcurrentComputeInstance (Context& context)
3017         : TestInstance  (context)
3018 {
3019 }
3020
3021 tcu::TestStatus ConcurrentComputeInstance::iterate (void)
3022 {
3023         enum {
3024                 NO_MATCH_FOUND  = ~((deUint32)0),
3025                 ERROR_NONE              = 0,
3026                 ERROR_WAIT              = 1,
3027                 ERROR_ORDER             = 2
3028         };
3029
3030         struct Queues
3031         {
3032                 VkQueue         queue;
3033                 deUint32        queueFamilyIndex;
3034         };
3035
3036         const DeviceInterface&                                  vk                                                      = m_context.getDeviceInterface();
3037         const deUint32                                                  numValues                                       = 1024;
3038         const InstanceInterface&                                instance                                        = m_context.getInstanceInterface();
3039         const VkPhysicalDevice                                  physicalDevice                          = m_context.getPhysicalDevice();
3040         tcu::TestLog&                                                   log                                                     = m_context.getTestContext().getLog();
3041         vk::Move<vk::VkDevice>                                  logicalDevice;
3042         std::vector<VkQueueFamilyProperties>    queueFamilyProperties;
3043         VkDeviceCreateInfo                                              deviceInfo;
3044         VkPhysicalDeviceFeatures                                deviceFeatures;
3045         const float                                                             queuePriorities[2]                      = {1.0f, 0.0f};
3046         VkDeviceQueueCreateInfo                                 queueInfos[2];
3047         Queues                                                                  queues[2]                                       =
3048                                                                                                                                                 {
3049                                                                                                                                                         {DE_NULL, (deUint32)NO_MATCH_FOUND},
3050                                                                                                                                                         {DE_NULL, (deUint32)NO_MATCH_FOUND}
3051                                                                                                                                                 };
3052
3053         queueFamilyProperties = getPhysicalDeviceQueueFamilyProperties(instance, physicalDevice);
3054
3055         for (deUint32 queueNdx = 0; queueNdx < queueFamilyProperties.size(); ++queueNdx)
3056         {
3057                 if (queueFamilyProperties[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
3058                 {
3059                         if (NO_MATCH_FOUND == queues[0].queueFamilyIndex)
3060                                 queues[0].queueFamilyIndex = queueNdx;
3061
3062                         if (queues[0].queueFamilyIndex != queueNdx || queueFamilyProperties[queueNdx].queueCount > 1u)
3063                         {
3064                                 queues[1].queueFamilyIndex = queueNdx;
3065                                 break;
3066                         }
3067                 }
3068         }
3069
3070         if (queues[0].queueFamilyIndex == NO_MATCH_FOUND || queues[1].queueFamilyIndex == NO_MATCH_FOUND)
3071                 TCU_THROW(NotSupportedError, "Queues couldn't be created");
3072
3073         for (int queueNdx = 0; queueNdx < 2; ++queueNdx)
3074         {
3075                 VkDeviceQueueCreateInfo queueInfo;
3076                 deMemset(&queueInfo, 0, sizeof(queueInfo));
3077
3078                 queueInfo.sType                         = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
3079                 queueInfo.pNext                         = DE_NULL;
3080                 queueInfo.flags                         = (VkDeviceQueueCreateFlags)0u;
3081                 queueInfo.queueFamilyIndex      = queues[queueNdx].queueFamilyIndex;
3082                 queueInfo.queueCount            = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 2 : 1;
3083                 queueInfo.pQueuePriorities      = (queueInfo.queueCount == 2) ? queuePriorities : &queuePriorities[queueNdx];
3084
3085                 queueInfos[queueNdx]            = queueInfo;
3086
3087                 if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3088                         break;
3089         }
3090         deMemset(&deviceInfo, 0, sizeof(deviceInfo));
3091         instance.getPhysicalDeviceFeatures(physicalDevice, &deviceFeatures);
3092
3093         deviceInfo.sType                                        = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
3094         deviceInfo.pNext                                        = DE_NULL;
3095         deviceInfo.enabledExtensionCount        = 0u;
3096         deviceInfo.ppEnabledExtensionNames      = DE_NULL;
3097         deviceInfo.enabledLayerCount            = 0u;
3098         deviceInfo.ppEnabledLayerNames          = DE_NULL;
3099         deviceInfo.pEnabledFeatures                     = &deviceFeatures;
3100         deviceInfo.queueCreateInfoCount         = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 1 : 2;
3101         deviceInfo.pQueueCreateInfos            = queueInfos;
3102
3103         logicalDevice = createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), m_context.getInstance(), instance, physicalDevice, &deviceInfo);
3104
3105         for (deUint32 queueReqNdx = 0; queueReqNdx < 2; ++queueReqNdx)
3106         {
3107                 if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3108                         vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, queueReqNdx, &queues[queueReqNdx].queue);
3109                 else
3110                         vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, 0u, &queues[queueReqNdx].queue);
3111         }
3112
3113         // Create an input/output buffers
3114         const VkPhysicalDeviceMemoryProperties memoryProperties = vk::getPhysicalDeviceMemoryProperties(instance, physicalDevice);
3115
3116         SimpleAllocator *allocator                                                              = new SimpleAllocator(vk, *logicalDevice, memoryProperties);
3117         const VkDeviceSize bufferSizeBytes                                              = sizeof(deUint32) * numValues;
3118         const Buffer buffer1(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3119         const Buffer buffer2(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3120
3121         // Fill the buffers with data
3122
3123         typedef std::vector<deUint32> data_vector_t;
3124         data_vector_t inputData(numValues);
3125
3126         {
3127                 de::Random rnd(0x82ce7f);
3128                 const Allocation& bufferAllocation1     = buffer1.getAllocation();
3129                 const Allocation& bufferAllocation2     = buffer2.getAllocation();
3130                 deUint32* bufferPtr1                            = static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3131                 deUint32* bufferPtr2                            = static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3132
3133                 for (deUint32 i = 0; i < numValues; ++i)
3134                 {
3135                         deUint32 val = rnd.getUint32();
3136                         inputData[i] = val;
3137                         *bufferPtr1++ = val;
3138                         *bufferPtr2++ = val;
3139                 }
3140
3141                 flushAlloc(vk, *logicalDevice, bufferAllocation1);
3142                 flushAlloc(vk, *logicalDevice, bufferAllocation2);
3143         }
3144
3145         // Create descriptor sets
3146
3147         const Unique<VkDescriptorSetLayout>     descriptorSetLayout1(
3148                 DescriptorSetLayoutBuilder()
3149                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3150                 .build(vk, *logicalDevice));
3151
3152         const Unique<VkDescriptorPool>          descriptorPool1(
3153                 DescriptorPoolBuilder()
3154                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3155                 .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3156
3157         const Unique<VkDescriptorSet>           descriptorSet1(makeDescriptorSet(vk, *logicalDevice, *descriptorPool1, *descriptorSetLayout1));
3158
3159         const VkDescriptorBufferInfo            bufferDescriptorInfo1   = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
3160                 DescriptorSetUpdateBuilder()
3161                 .writeSingle(*descriptorSet1, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo1)
3162                 .update(vk, *logicalDevice);
3163
3164         const Unique<VkDescriptorSetLayout>     descriptorSetLayout2(
3165                 DescriptorSetLayoutBuilder()
3166                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3167                 .build(vk, *logicalDevice));
3168
3169         const Unique<VkDescriptorPool>          descriptorPool2(
3170                 DescriptorPoolBuilder()
3171                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3172                 .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3173
3174         const Unique<VkDescriptorSet>           descriptorSet2(makeDescriptorSet(vk, *logicalDevice, *descriptorPool2, *descriptorSetLayout2));
3175
3176         const VkDescriptorBufferInfo            bufferDescriptorInfo2   = makeDescriptorBufferInfo(*buffer2, 0ull, bufferSizeBytes);
3177                 DescriptorSetUpdateBuilder()
3178                 .writeSingle(*descriptorSet2, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo2)
3179                 .update(vk, *logicalDevice);
3180
3181         // Perform the computation
3182
3183         const Unique<VkShaderModule>            shaderModule(createShaderModule(vk, *logicalDevice, m_context.getBinaryCollection().get("comp"), 0u));
3184
3185         const Unique<VkPipelineLayout>          pipelineLayout1(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout1));
3186         const Unique<VkPipeline>                        pipeline1(makeComputePipeline(vk, *logicalDevice, *pipelineLayout1, *shaderModule));
3187         const VkBufferMemoryBarrier                     hostWriteBarrier1               = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3188         const VkBufferMemoryBarrier                     shaderWriteBarrier1             = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3189         const Unique<VkCommandPool>                     cmdPool1(makeCommandPool(vk, *logicalDevice, queues[0].queueFamilyIndex));
3190         const Unique<VkCommandBuffer>           cmdBuffer1(allocateCommandBuffer(vk, *logicalDevice, *cmdPool1, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3191
3192         const Unique<VkPipelineLayout>          pipelineLayout2(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout2));
3193         const Unique<VkPipeline>                        pipeline2(makeComputePipeline(vk, *logicalDevice, *pipelineLayout2, *shaderModule));
3194         const VkBufferMemoryBarrier                     hostWriteBarrier2               = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3195         const VkBufferMemoryBarrier                     shaderWriteBarrier2             = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3196         const Unique<VkCommandPool>                     cmdPool2(makeCommandPool(vk, *logicalDevice, queues[1].queueFamilyIndex));
3197         const Unique<VkCommandBuffer>           cmdBuffer2(allocateCommandBuffer(vk, *logicalDevice, *cmdPool2, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3198
3199         // Command buffer 1
3200
3201         beginCommandBuffer(vk, *cmdBuffer1);
3202         vk.cmdBindPipeline(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
3203         vk.cmdBindDescriptorSets(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout1, 0u, 1u, &descriptorSet1.get(), 0u, DE_NULL);
3204         vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3205         vk.cmdDispatch(*cmdBuffer1, 1, 1, 1);
3206         vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3207         endCommandBuffer(vk, *cmdBuffer1);
3208
3209         // Command buffer 2
3210
3211         beginCommandBuffer(vk, *cmdBuffer2);
3212         vk.cmdBindPipeline(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline2);
3213         vk.cmdBindDescriptorSets(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout2, 0u, 1u, &descriptorSet2.get(), 0u, DE_NULL);
3214         vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3215         vk.cmdDispatch(*cmdBuffer2, 1, 1, 1);
3216         vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3217         endCommandBuffer(vk, *cmdBuffer2);
3218
3219         VkSubmitInfo    submitInfo1 =
3220         {
3221                 VK_STRUCTURE_TYPE_SUBMIT_INFO,                  // sType
3222                 DE_NULL,                                                                // pNext
3223                 0u,                                                                             // waitSemaphoreCount
3224                 DE_NULL,                                                                // pWaitSemaphores
3225                 (const VkPipelineStageFlags*)DE_NULL,   // pWaitDstStageMask
3226                 1u,                                                                             // commandBufferCount
3227                 &cmdBuffer1.get(),                                              // pCommandBuffers
3228                 0u,                                                                             // signalSemaphoreCount
3229                 DE_NULL                                                                 // pSignalSemaphores
3230         };
3231
3232         VkSubmitInfo    submitInfo2 =
3233         {
3234                 VK_STRUCTURE_TYPE_SUBMIT_INFO,                  // sType
3235                 DE_NULL,                                                                // pNext
3236                 0u,                                                                             // waitSemaphoreCount
3237                 DE_NULL,                                                                // pWaitSemaphores
3238                 (const VkPipelineStageFlags*)DE_NULL,   // pWaitDstStageMask
3239                 1u,                                                                             // commandBufferCount
3240                 &cmdBuffer2.get(),                                              // pCommandBuffers
3241                 0u,                                                                             // signalSemaphoreCount
3242                 DE_NULL                                                                 // pSignalSemaphores
3243         };
3244
3245         // Wait for completion
3246         const Unique<VkFence>   fence1(createFence(vk, *logicalDevice));
3247         const Unique<VkFence>   fence2(createFence(vk, *logicalDevice));
3248
3249         VK_CHECK(vk.queueSubmit(queues[0].queue, 1u, &submitInfo1, *fence1));
3250         VK_CHECK(vk.queueSubmit(queues[1].queue, 1u, &submitInfo2, *fence2));
3251
3252         int err = ERROR_NONE;
3253
3254         // First wait for the low-priority queue
3255         if (VK_SUCCESS != vk.waitForFences(*logicalDevice, 1u, &fence2.get(), DE_TRUE, ~0ull))
3256                 err = ERROR_WAIT;
3257
3258         // If the high-priority queue hasn't finished, we have a problem.
3259         if (VK_SUCCESS != vk.getFenceStatus(*logicalDevice, fence1.get()))
3260                 if (err == ERROR_NONE)
3261                         err = ERROR_ORDER;
3262
3263         // Wait for the high-priority fence so we don't get errors on teardown.
3264         vk.waitForFences(*logicalDevice, 1u, &fence1.get(), DE_TRUE, ~0ull);
3265
3266         // If we fail() before waiting for all of the fences, error will come from
3267         // teardown instead of the error we want.
3268
3269         if (err == ERROR_WAIT)
3270                 return tcu::TestStatus::fail("Failed waiting for low-priority queue fence.");
3271
3272         // Validate the results
3273
3274         const Allocation& bufferAllocation1     = buffer1.getAllocation();
3275         invalidateAlloc(vk, *logicalDevice, bufferAllocation1);
3276         const deUint32* bufferPtr1                      = static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3277
3278         const Allocation& bufferAllocation2     = buffer2.getAllocation();
3279         invalidateAlloc(vk, *logicalDevice, bufferAllocation2);
3280         const deUint32* bufferPtr2                      = static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3281
3282         for (deUint32 ndx = 0; ndx < numValues; ++ndx)
3283         {
3284                 const deUint32 res1     = bufferPtr1[ndx];
3285                 const deUint32 res2     = bufferPtr2[ndx];
3286                 const deUint32 inp      = inputData[ndx];
3287                 const deUint32 ref      = ~inp;
3288
3289                 if (res1 != ref || res1 != res2)
3290                 {
3291                         std::ostringstream msg;
3292                         msg << "Comparison failed for InOut.values[" << ndx << "] ref:" << ref <<" res1:" << res1 << " res2:" << res2 << " inp:" << inp;
3293                         return tcu::TestStatus::fail(msg.str());
3294                 }
3295         }
3296
3297         if (err == ERROR_ORDER)
3298                 log << tcu::TestLog::Message << "Note: Low-priority queue was faster than high-priority one. This is not an error, but priorities may be inverted." << tcu::TestLog::EndMessage;
3299
3300         return tcu::TestStatus::pass("Test passed");
3301 }
3302
3303 class MaxWorkGroupSizeTest : public vkt::TestCase
3304 {
3305 public:
3306         enum class Axis { X = 0, Y = 1, Z = 2 };
3307
3308         struct Params
3309         {
3310                 // Which axis to maximize.
3311                 Axis axis;
3312         };
3313
3314                                                         MaxWorkGroupSizeTest    (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params);
3315         virtual                                 ~MaxWorkGroupSizeTest   (void) {}
3316
3317         virtual void                    initPrograms                    (vk::SourceCollections& programCollection) const;
3318         virtual TestInstance*   createInstance                  (Context& context) const;
3319         virtual void                    checkSupport                    (Context& context) const;
3320
3321         // Helper to transform the axis value to an index.
3322         static int                              getIndex                                (Axis axis);
3323
3324         // Helper returning the number of invocations according to the test parameters.
3325         static deUint32                 getInvocations                  (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties = nullptr);
3326
3327         // Helper returning the buffer size needed to this test.
3328         static deUint32                 getSSBOSize                             (deUint32 invocations);
3329
3330 private:
3331         Params m_params;
3332 };
3333
3334 class MaxWorkGroupSizeInstance : public vkt::TestInstance
3335 {
3336 public:
3337                                                                 MaxWorkGroupSizeInstance        (Context& context, const MaxWorkGroupSizeTest::Params& params);
3338         virtual                                         ~MaxWorkGroupSizeInstance       (void) {}
3339
3340         virtual tcu::TestStatus         iterate                 (void);
3341
3342 private:
3343         MaxWorkGroupSizeTest::Params m_params;
3344 };
3345
3346 int MaxWorkGroupSizeTest::getIndex (Axis axis)
3347 {
3348         const int ret = static_cast<int>(axis);
3349         DE_ASSERT(ret >= static_cast<int>(Axis::X) && ret <= static_cast<int>(Axis::Z));
3350         return ret;
3351 }
3352
3353 deUint32 MaxWorkGroupSizeTest::getInvocations (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties)
3354 {
3355         const auto axis = getIndex(params.axis);
3356
3357         if (devProperties)
3358                 return devProperties->limits.maxComputeWorkGroupSize[axis];
3359         return vk::getPhysicalDeviceProperties(vki, physicalDevice).limits.maxComputeWorkGroupSize[axis];
3360 }
3361
3362 deUint32 MaxWorkGroupSizeTest::getSSBOSize (deUint32 invocations)
3363 {
3364         return invocations * static_cast<deUint32>(sizeof(deUint32));
3365 }
3366
3367 MaxWorkGroupSizeTest::MaxWorkGroupSizeTest (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params)
3368         : vkt::TestCase (testCtx, name, description)
3369         , m_params              (params)
3370 {}
3371
3372 void MaxWorkGroupSizeTest::initPrograms (vk::SourceCollections& programCollection) const
3373 {
3374         std::ostringstream shader;
3375
3376         // The actual local sizes will be set using spec constants when running the test instance.
3377         shader
3378                 << "#version 450\n"
3379                 << "\n"
3380                 << "layout(constant_id=0) const int local_size_x_val = 1;\n"
3381                 << "layout(constant_id=1) const int local_size_y_val = 1;\n"
3382                 << "layout(constant_id=2) const int local_size_z_val = 1;\n"
3383                 << "\n"
3384                 << "layout(local_size_x_id=0, local_size_y_id=1, local_size_z_id=2) in;\n"
3385                 << "\n"
3386                 << "layout(set=0, binding=0) buffer StorageBuffer {\n"
3387                 << "    uint values[];\n"
3388                 << "} ssbo;\n"
3389                 << "\n"
3390                 << "void main() {\n"
3391                 << "    ssbo.values[gl_LocalInvocationIndex] = 1u;\n"
3392                 << "}\n"
3393                 ;
3394
3395         programCollection.glslSources.add("comp") << glu::ComputeSource(shader.str());
3396 }
3397
3398 TestInstance* MaxWorkGroupSizeTest::createInstance (Context& context) const
3399 {
3400         return new MaxWorkGroupSizeInstance(context, m_params);
3401 }
3402
3403 void MaxWorkGroupSizeTest::checkSupport (Context& context) const
3404 {
3405         const auto&     vki                             = context.getInstanceInterface();
3406         const auto      physicalDevice  = context.getPhysicalDevice();
3407
3408         const auto      properties              = vk::getPhysicalDeviceProperties(vki, physicalDevice);
3409         const auto      invocations             = getInvocations(m_params, vki, physicalDevice, &properties);
3410
3411         if (invocations > properties.limits.maxComputeWorkGroupInvocations)
3412                 TCU_FAIL("Reported workgroup size limit in the axis is greater than the global invocation limit");
3413
3414         if (properties.limits.maxStorageBufferRange / static_cast<deUint32>(sizeof(deUint32)) < invocations)
3415                 TCU_THROW(NotSupportedError, "Maximum supported storage buffer range too small");
3416 }
3417
3418 MaxWorkGroupSizeInstance::MaxWorkGroupSizeInstance (Context& context, const MaxWorkGroupSizeTest::Params& params)
3419         : vkt::TestInstance     (context)
3420         , m_params                      (params)
3421 {}
3422
3423 tcu::TestStatus MaxWorkGroupSizeInstance::iterate (void)
3424 {
3425         const auto&     vki                             = m_context.getInstanceInterface();
3426         const auto&     vkd                             = m_context.getDeviceInterface();
3427         const auto      physicalDevice  = m_context.getPhysicalDevice();
3428         const auto      device                  = m_context.getDevice();
3429         auto&           alloc                   = m_context.getDefaultAllocator();
3430         const auto      queueIndex              = m_context.getUniversalQueueFamilyIndex();
3431         const auto      queue                   = m_context.getUniversalQueue();
3432         auto&           log                             = m_context.getTestContext().getLog();
3433
3434         const auto      axis                    = MaxWorkGroupSizeTest::getIndex(m_params.axis);
3435         const auto      invocations             = MaxWorkGroupSizeTest::getInvocations(m_params, vki, physicalDevice);
3436         const auto      ssboSize                = static_cast<vk::VkDeviceSize>(MaxWorkGroupSizeTest::getSSBOSize(invocations));
3437
3438         log
3439                 << tcu::TestLog::Message
3440                 << "Running test with " << invocations << " invocations on axis " << axis << " using a storage buffer size of " << ssboSize << " bytes"
3441                 << tcu::TestLog::EndMessage
3442                 ;
3443
3444         // Main SSBO buffer.
3445         const auto                              ssboInfo        = vk::makeBufferCreateInfo(ssboSize, vk::VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
3446         vk::BufferWithMemory    ssbo            (vkd, device, alloc, ssboInfo, vk::MemoryRequirement::HostVisible);
3447
3448         // Shader module.
3449         const auto shaderModule = vk::createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);
3450
3451         // Descriptor set layouts.
3452         vk::DescriptorSetLayoutBuilder layoutBuilder;
3453         layoutBuilder.addSingleBinding(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, vk::VK_SHADER_STAGE_COMPUTE_BIT);
3454         const auto descriptorSetLayout = layoutBuilder.build(vkd, device);
3455
3456         // Specialization constants: set the number of invocations in the appropriate local size id.
3457         const auto      entrySize                               = static_cast<deUintptr>(sizeof(deInt32));
3458         deInt32         specializationData[3]   = { 1, 1, 1 };
3459         specializationData[axis] = static_cast<deInt32>(invocations);
3460
3461         const vk::VkSpecializationMapEntry specializationMaps[3] =
3462         {
3463                 {
3464                         0u,                                                                             //      deUint32        constantID;
3465                         0u,                                                                             //      deUint32        offset;
3466                         entrySize,                                                              //      deUintptr       size;
3467                 },
3468                 {
3469                         1u,                                                                             //      deUint32        constantID;
3470                         static_cast<deUint32>(entrySize),               //      deUint32        offset;
3471                         entrySize,                                                              //      deUintptr       size;
3472                 },
3473                 {
3474                         2u,                                                                             //      deUint32        constantID;
3475                         static_cast<deUint32>(entrySize * 2u),  //      deUint32        offset;
3476                         entrySize,                                                              //      deUintptr       size;
3477                 },
3478         };
3479
3480         const vk::VkSpecializationInfo specializationInfo =
3481         {
3482                 3u,                                                                                                     //      deUint32                                                mapEntryCount;
3483                 specializationMaps,                                                                     //      const VkSpecializationMapEntry* pMapEntries;
3484                 static_cast<deUintptr>(sizeof(specializationData)),     //      deUintptr                                               dataSize;
3485                 specializationData,                                                                     //      const void*                                             pData;
3486         };
3487
3488         // Test pipeline.
3489         const vk::VkPipelineLayoutCreateInfo testPipelineLayoutInfo =
3490         {
3491                 vk::VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,      //      VkStructureType                                 sType;
3492                 nullptr,                                                                                        //      const void*                                             pNext;
3493                 0u,                                                                                                     //      VkPipelineLayoutCreateFlags             flags;
3494                 1u,                                                                                                     //      deUint32                                                setLayoutCount;
3495                 &descriptorSetLayout.get(),                                                     //      const VkDescriptorSetLayout*    pSetLayouts;
3496                 0u,                                                                                                     //      deUint32                                                pushConstantRangeCount;
3497                 nullptr,                                                                                        //      const VkPushConstantRange*              pPushConstantRanges;
3498         };
3499         const auto testPipelineLayout = vk::createPipelineLayout(vkd, device, &testPipelineLayoutInfo);
3500
3501         const vk::VkComputePipelineCreateInfo testPipelineInfo =
3502         {
3503                 vk::VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,     //      VkStructureType                                 sType;
3504                 nullptr,                                                                                        //      const void*                                             pNext;
3505                 0u,                                                                                                     //      VkPipelineCreateFlags                   flags;
3506                 {                                                                                                       //      VkPipelineShaderStageCreateInfo stage;
3507                         vk::VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,//      VkStructureType                                         sType;
3508                         nullptr,                                                                                                //      const void*                                                     pNext;
3509                         0u,                                                                                                             //      VkPipelineShaderStageCreateFlags        flags;
3510                         vk::VK_SHADER_STAGE_COMPUTE_BIT,                                                //      VkShaderStageFlagBits                           stage;
3511                         shaderModule.get(),                                                                             //      VkShaderModule                                          module;
3512                         "main",                                                                                                 //      const char*                                                     pName;
3513                         &specializationInfo,                                                                    //      const VkSpecializationInfo*                     pSpecializationInfo;
3514                 },
3515                 testPipelineLayout.get(),                                                       //      VkPipelineLayout                                layout;
3516                 DE_NULL,                                                                                        //      VkPipeline                                              basePipelineHandle;
3517                 0u,                                                                                                     //      deInt32                                                 basePipelineIndex;
3518         };
3519         const auto testPipeline = vk::createComputePipeline(vkd, device, DE_NULL, &testPipelineInfo);
3520
3521         // Create descriptor pool and set.
3522         vk::DescriptorPoolBuilder poolBuilder;
3523         poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
3524         const auto descriptorPool       = poolBuilder.build(vkd, device, vk::VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
3525         const auto descriptorSet        = vk::makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());
3526
3527         // Update descriptor set.
3528         const vk::VkDescriptorBufferInfo ssboBufferInfo =
3529         {
3530                 ssbo.get(),             //      VkBuffer                buffer;
3531                 0u,                             //      VkDeviceSize    offset;
3532                 VK_WHOLE_SIZE,  //      VkDeviceSize    range;
3533         };
3534
3535         vk::DescriptorSetUpdateBuilder updateBuilder;
3536         updateBuilder.writeSingle(descriptorSet.get(), vk::DescriptorSetUpdateBuilder::Location::binding(0u), vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &ssboBufferInfo);
3537         updateBuilder.update(vkd, device);
3538
3539         // Clear buffer.
3540         auto& ssboAlloc = ssbo.getAllocation();
3541         void* ssboPtr   = ssboAlloc.getHostPtr();
3542         deMemset(ssboPtr, 0, static_cast<size_t>(ssboSize));
3543         vk::flushAlloc(vkd, device, ssboAlloc);
3544
3545         // Run pipelines.
3546         const auto cmdPool              = vk::makeCommandPool(vkd, device, queueIndex);
3547         const auto cmdBUfferPtr = vk::allocateCommandBuffer(vkd, device, cmdPool.get(), vk::VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3548         const auto cmdBuffer    = cmdBUfferPtr.get();
3549
3550         vk::beginCommandBuffer(vkd, cmdBuffer);
3551
3552         // Run the main test shader.
3553         const auto hostToComputeBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_HOST_WRITE_BIT, vk::VK_ACCESS_SHADER_WRITE_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
3554         vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_HOST_BIT, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0u, 0u, nullptr, 1u, &hostToComputeBarrier, 0u, nullptr);
3555
3556         vkd.cmdBindPipeline(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipeline.get());
3557         vkd.cmdBindDescriptorSets(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
3558         vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);
3559
3560         const auto computeToHostBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_SHADER_WRITE_BIT, vk::VK_ACCESS_HOST_READ_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
3561         vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, vk::VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u, nullptr, 1u, &computeToHostBarrier, 0u, nullptr);
3562
3563         vk::endCommandBuffer(vkd, cmdBuffer);
3564         vk::submitCommandsAndWait(vkd, device, queue, cmdBuffer);
3565
3566         // Verify buffer contents.
3567         vk::invalidateAlloc(vkd, device, ssboAlloc);
3568         std::unique_ptr<deUint32[]>     valuesArray     (new deUint32[invocations]);
3569         deUint32*                                       valuesPtr       = valuesArray.get();
3570         deMemcpy(valuesPtr, ssboPtr, static_cast<size_t>(ssboSize));
3571
3572         std::string     errorMsg;
3573         bool            ok                      = true;
3574
3575         for (size_t i = 0; i < invocations; ++i)
3576         {
3577                 if (valuesPtr[i] != 1u)
3578                 {
3579                         ok                      = false;
3580                         errorMsg        = "Found invalid value for invocation index " + de::toString(i) + ": expected 1u and found " + de::toString(valuesPtr[i]);
3581                         break;
3582                 }
3583         }
3584
3585         if (!ok)
3586                 return tcu::TestStatus::fail(errorMsg);
3587         return tcu::TestStatus::pass("Pass");
3588 }
3589
3590 namespace EmptyShaderTest
3591 {
3592
3593 void createProgram (SourceCollections& dst)
3594 {
3595         dst.glslSources.add("comp") << glu::ComputeSource(
3596                 "#version 310 es\n"
3597                 "layout (local_size_x = 1) in;\n"
3598                 "void main (void) {}\n"
3599         );
3600 }
3601
3602 tcu::TestStatus createTest (Context& context)
3603 {
3604         const DeviceInterface&  vk                                      = context.getDeviceInterface();
3605         const VkDevice                  device                          = context.getDevice();
3606         const VkQueue                   queue                           = context.getUniversalQueue();
3607         const deUint32                  queueFamilyIndex        = context.getUniversalQueueFamilyIndex();
3608
3609         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, context.getBinaryCollection().get("comp"), 0u));
3610
3611         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device));
3612         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
3613
3614         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
3615         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3616
3617         // Start recording commands
3618
3619         beginCommandBuffer(vk, *cmdBuffer);
3620
3621         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
3622
3623         const tcu::IVec3 workGroups(1, 1, 1);
3624         vk.cmdDispatch(*cmdBuffer, workGroups.x(), workGroups.y(), workGroups.z());
3625
3626         endCommandBuffer(vk, *cmdBuffer);
3627
3628         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
3629
3630         return tcu::TestStatus::pass("Compute succeeded");
3631 }
3632
3633 } // EmptyShaderTest ns
3634 } // anonymous
3635
3636 tcu::TestCaseGroup* createBasicComputeShaderTests (tcu::TestContext& testCtx)
3637 {
3638         de::MovePtr<tcu::TestCaseGroup> basicComputeTests(new tcu::TestCaseGroup(testCtx, "basic", "Basic compute tests"));
3639
3640         addFunctionCaseWithPrograms(basicComputeTests.get(), "empty_shader", "Shader that does nothing", EmptyShaderTest::createProgram, EmptyShaderTest::createTest);
3641
3642         basicComputeTests->addChild(new ConcurrentCompute(testCtx, "concurrent_compute", "Concurrent compute test"));
3643
3644         basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_x", "Use the maximum work group size on the X axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::X}));
3645         basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_y", "Use the maximum work group size on the Y axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Y}));
3646         basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_z", "Use the maximum work group size on the Z axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Z}));
3647
3648         basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,      "ubo_to_ssbo_single_invocation",        "Copy from UBO to SSBO, inverting bits",        256,    tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3649         basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,      "ubo_to_ssbo_single_group",                     "Copy from UBO to SSBO, inverting bits",        1024,   tcu::IVec3(2,1,4),      tcu::IVec3(1,1,1)));
3650         basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,      "ubo_to_ssbo_multiple_invocations",     "Copy from UBO to SSBO, inverting bits",        1024,   tcu::IVec3(1,1,1),      tcu::IVec3(2,4,1)));
3651         basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,      "ubo_to_ssbo_multiple_groups",          "Copy from UBO to SSBO, inverting bits",        1024,   tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3652
3653         basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,       "copy_ssbo_single_invocation",          "Copy between SSBOs, inverting bits",   256,    tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3654         basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,       "copy_ssbo_multiple_invocations",       "Copy between SSBOs, inverting bits",   1024,   tcu::IVec3(1,1,1),      tcu::IVec3(2,4,1)));
3655         basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,       "copy_ssbo_multiple_groups",            "Copy between SSBOs, inverting bits",   1024,   tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3656
3657         basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,  "ssbo_rw_single_invocation",                    "Read and write same SSBO",             256,    true,   tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3658         basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,  "ssbo_rw_multiple_groups",                              "Read and write same SSBO",             1024,   true,   tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3659         basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,  "ssbo_unsized_arr_single_invocation",   "Read and write same SSBO",             256,    false,  tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3660         basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,  "ssbo_unsized_arr_multiple_groups",             "Read and write same SSBO",             1024,   false,  tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3661
3662         basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,        "write_multiple_arr_single_invocation",                 "Write to multiple SSBOs",      256,    true,   tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3663         basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,        "write_multiple_arr_multiple_groups",                   "Write to multiple SSBOs",      1024,   true,   tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3664         basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,        "write_multiple_unsized_arr_single_invocation", "Write to multiple SSBOs",      256,    false,  tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3665         basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,        "write_multiple_unsized_arr_multiple_groups",   "Write to multiple SSBOs",      1024,   false,  tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3666
3667         basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,   "ssbo_local_barrier_single_invocation", "SSBO local barrier usage",     tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3668         basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,   "ssbo_local_barrier_single_group",              "SSBO local barrier usage",     tcu::IVec3(3,2,5),      tcu::IVec3(1,1,1)));
3669         basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,   "ssbo_local_barrier_multiple_groups",   "SSBO local barrier usage",     tcu::IVec3(3,4,1),      tcu::IVec3(2,7,3)));
3670
3671         basicComputeTests->addChild(new SSBOBarrierTest(testCtx,        "ssbo_cmd_barrier_single",              "SSBO memory barrier usage",    tcu::IVec3(1,1,1)));
3672         basicComputeTests->addChild(new SSBOBarrierTest(testCtx,        "ssbo_cmd_barrier_multiple",    "SSBO memory barrier usage",    tcu::IVec3(11,5,7)));
3673
3674         basicComputeTests->addChild(new SharedVarTest(testCtx,  "shared_var_single_invocation",         "Basic shared variable usage",  tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3675         basicComputeTests->addChild(new SharedVarTest(testCtx,  "shared_var_single_group",                      "Basic shared variable usage",  tcu::IVec3(3,2,5),      tcu::IVec3(1,1,1)));
3676         basicComputeTests->addChild(new SharedVarTest(testCtx,  "shared_var_multiple_invocations",      "Basic shared variable usage",  tcu::IVec3(1,1,1),      tcu::IVec3(2,5,4)));
3677         basicComputeTests->addChild(new SharedVarTest(testCtx,  "shared_var_multiple_groups",           "Basic shared variable usage",  tcu::IVec3(3,4,1),      tcu::IVec3(2,7,3)));
3678
3679         basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,  "shared_atomic_op_single_invocation",           "Atomic operation with shared var",             tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3680         basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,  "shared_atomic_op_single_group",                        "Atomic operation with shared var",             tcu::IVec3(3,2,5),      tcu::IVec3(1,1,1)));
3681         basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,  "shared_atomic_op_multiple_invocations",        "Atomic operation with shared var",             tcu::IVec3(1,1,1),      tcu::IVec3(2,5,4)));
3682         basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,  "shared_atomic_op_multiple_groups",                     "Atomic operation with shared var",             tcu::IVec3(3,4,1),      tcu::IVec3(2,7,3)));
3683
3684         basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,    "copy_image_to_ssbo_small",     "Image to SSBO copy",   tcu::IVec2(1,1),        tcu::IVec2(64,64)));
3685         basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,    "copy_image_to_ssbo_large",     "Image to SSBO copy",   tcu::IVec2(2,4),        tcu::IVec2(512,512)));
3686
3687         basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,    "copy_ssbo_to_image_small",     "SSBO to image copy",   tcu::IVec2(1, 1),       tcu::IVec2(64, 64)));
3688         basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,    "copy_ssbo_to_image_large",     "SSBO to image copy",   tcu::IVec2(2, 4),       tcu::IVec2(512, 512)));
3689
3690         basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,      "image_atomic_op_local_size_1", "Atomic operation with image",  1,      tcu::IVec2(64,64)));
3691         basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,      "image_atomic_op_local_size_8", "Atomic operation with image",  8,      tcu::IVec2(64,64)));
3692
3693         basicComputeTests->addChild(new ImageBarrierTest(testCtx,       "image_barrier_single",         "Image barrier",        tcu::IVec2(1,1)));
3694         basicComputeTests->addChild(new ImageBarrierTest(testCtx,       "image_barrier_multiple",       "Image barrier",        tcu::IVec2(64,64)));
3695
3696         return basicComputeTests.release();
3697 }
3698
3699 tcu::TestCaseGroup* createBasicDeviceGroupComputeShaderTests (tcu::TestContext& testCtx)
3700 {
3701         de::MovePtr<tcu::TestCaseGroup> deviceGroupComputeTests(new tcu::TestCaseGroup(testCtx, "device_group", "Basic device group compute tests"));
3702
3703         deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx, "dispatch_base",        "Compute shader with base groups",                              32768,  tcu::IVec3(4,2,4),      tcu::IVec3(16,8,8),     tcu::IVec3(4,8,8)));
3704         deviceGroupComputeTests->addChild(new DeviceIndexTest(testCtx,  "device_index",         "Compute shader using deviceIndex in SPIRV",    96,             tcu::IVec3(3,2,1),      tcu::IVec3(2,4,1)));
3705
3706         return deviceGroupComputeTests.release();
3707
3708 }
3709 } // compute
3710 } // vkt