Merge vk-gl-cts/vulkan-cts-1.3.2 into vk-gl-cts/main
[platform/upstream/VK-GL-CTS.git] / external / vulkancts / modules / vulkan / compute / vktComputeBasicComputeShaderTests.cpp
/*------------------------------------------------------------------------
 * Vulkan Conformance Tests
 * ------------------------
 *
 * Copyright (c) 2019 The Khronos Group Inc.
 * Copyright (c) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *//*!
 * \file
 * \brief Compute Shader Tests
 *//*--------------------------------------------------------------------*/
24
25 #include "vktComputeBasicComputeShaderTests.hpp"
26 #include "vktTestCase.hpp"
27 #include "vktTestCaseUtil.hpp"
28 #include "vktComputeTestsUtil.hpp"
29 #include "vktCustomInstancesDevices.hpp"
30 #include "vktAmberTestCase.hpp"
31
32 #include "vkDefs.hpp"
33 #include "vkRef.hpp"
34 #include "vkRefUtil.hpp"
35 #include "vkPlatform.hpp"
36 #include "vkPrograms.hpp"
37 #include "vkRefUtil.hpp"
38 #include "vkMemUtil.hpp"
39 #include "vkBarrierUtil.hpp"
40 #include "vkQueryUtil.hpp"
41 #include "vkBuilderUtil.hpp"
42 #include "vkTypeUtil.hpp"
43 #include "vkDeviceUtil.hpp"
44 #include "vkCmdUtil.hpp"
45 #include "vkObjUtil.hpp"
46 #include "vkBufferWithMemory.hpp"
47 #include "vkSafetyCriticalUtil.hpp"
48
49 #include "tcuCommandLine.hpp"
50 #include "tcuTestLog.hpp"
51
52 #include "deStringUtil.hpp"
53 #include "deUniquePtr.hpp"
54 #include "deRandom.hpp"
55
56 #include <vector>
57 #include <memory>
58
59 using namespace vk;
60
61 namespace vkt
62 {
63 namespace compute
64 {
65 namespace
66 {
67
68 template<typename T, int size>
69 T multiplyComponents (const tcu::Vector<T, size>& v)
70 {
71         T accum = 1;
72         for (int i = 0; i < size; ++i)
73                 accum *= v[i];
74         return accum;
75 }
76
//! Returns \p a multiplied by itself.
template<typename T>
inline T squared (const T& a)
{
    const T product = a * a;
    return product;
}
82
83 inline VkImageCreateInfo make2DImageCreateInfo (const tcu::IVec2& imageSize, const VkImageUsageFlags usage)
84 {
85         const VkImageCreateInfo imageParams =
86         {
87                 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,                            // VkStructureType                      sType;
88                 DE_NULL,                                                                                        // const void*                          pNext;
89                 0u,                                                                                                     // VkImageCreateFlags           flags;
90                 VK_IMAGE_TYPE_2D,                                                                       // VkImageType                          imageType;
91                 VK_FORMAT_R32_UINT,                                                                     // VkFormat                                     format;
92                 vk::makeExtent3D(imageSize.x(), imageSize.y(), 1),      // VkExtent3D                           extent;
93                 1u,                                                                                                     // deUint32                                     mipLevels;
94                 1u,                                                                                                     // deUint32                                     arrayLayers;
95                 VK_SAMPLE_COUNT_1_BIT,                                                          // VkSampleCountFlagBits        samples;
96                 VK_IMAGE_TILING_OPTIMAL,                                                        // VkImageTiling                        tiling;
97                 usage,                                                                                          // VkImageUsageFlags            usage;
98                 VK_SHARING_MODE_EXCLUSIVE,                                                      // VkSharingMode                        sharingMode;
99                 0u,                                                                                                     // deUint32                                     queueFamilyIndexCount;
100                 DE_NULL,                                                                                        // const deUint32*                      pQueueFamilyIndices;
101                 VK_IMAGE_LAYOUT_UNDEFINED,                                                      // VkImageLayout                        initialLayout;
102         };
103         return imageParams;
104 }
105
106 inline VkBufferImageCopy makeBufferImageCopy(const tcu::IVec2& imageSize)
107 {
108         return compute::makeBufferImageCopy(vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), 1u);
109 }
110
//! Buffer flavour selector (uniform buffer vs. SSBO, going by the enumerator names;
//! the users of this enum are outside the visible part of the file).
enum BufferType
{
    BUFFER_TYPE_UNIFORM = 0,
    BUFFER_TYPE_SSBO    = 1,
};
116
117 class SharedVarTest : public vkt::TestCase
118 {
119 public:
120                                                 SharedVarTest   (tcu::TestContext&              testCtx,
121                                                                                  const std::string&             name,
122                                                                                  const std::string&             description,
123                                                                                  const tcu::IVec3&              localSize,
124                                                                                  const tcu::IVec3&              workSize);
125
126         void                            initPrograms    (SourceCollections&             sourceCollections) const;
127         TestInstance*           createInstance  (Context&                               context) const;
128
129 private:
130         const tcu::IVec3        m_localSize;
131         const tcu::IVec3        m_workSize;
132 };
133
134 class SharedVarTestInstance : public vkt::TestInstance
135 {
136 public:
137                                                                         SharedVarTestInstance   (Context&                       context,
138                                                                                                                          const tcu::IVec3&      localSize,
139                                                                                                                          const tcu::IVec3&      workSize);
140
141         tcu::TestStatus                                 iterate                                 (void);
142
143 private:
144         const tcu::IVec3                                m_localSize;
145         const tcu::IVec3                                m_workSize;
146 };
147
148 SharedVarTest::SharedVarTest (tcu::TestContext&         testCtx,
149                                                           const std::string&    name,
150                                                           const std::string&    description,
151                                                           const tcu::IVec3&             localSize,
152                                                           const tcu::IVec3&             workSize)
153         : TestCase              (testCtx, name, description)
154         , m_localSize   (localSize)
155         , m_workSize    (workSize)
156 {
157 }
158
159 void SharedVarTest::initPrograms (SourceCollections& sourceCollections) const
160 {
161         const int workGroupSize = multiplyComponents(m_localSize);
162         const int workGroupCount = multiplyComponents(m_workSize);
163         const int numValues = workGroupSize * workGroupCount;
164
165         std::ostringstream src;
166         src << "#version 310 es\n"
167                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
168                 << "layout(binding = 0) writeonly buffer Output {\n"
169                 << "    uint values[" << numValues << "];\n"
170                 << "} sb_out;\n\n"
171                 << "shared uint offsets[" << workGroupSize << "];\n\n"
172                 << "void main (void) {\n"
173                 << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
174                 << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
175                 << "    uint globalOffs = localSize*globalNdx;\n"
176                 << "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
177                 << "\n"
178                 << "    offsets[localSize-localOffs-1u] = globalOffs + localOffs*localOffs;\n"
179                 << "    memoryBarrierShared();\n"
180                 << "    barrier();\n"
181                 << "    sb_out.values[globalOffs + localOffs] = offsets[localOffs];\n"
182                 << "}\n";
183
184         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
185 }
186
187 TestInstance* SharedVarTest::createInstance (Context& context) const
188 {
189         return new SharedVarTestInstance(context, m_localSize, m_workSize);
190 }
191
192 SharedVarTestInstance::SharedVarTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
193         : TestInstance  (context)
194         , m_localSize   (localSize)
195         , m_workSize    (workSize)
196 {
197 }
198
199 tcu::TestStatus SharedVarTestInstance::iterate (void)
200 {
201         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
202         const VkDevice                  device                          = m_context.getDevice();
203         const VkQueue                   queue                           = m_context.getUniversalQueue();
204         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
205         Allocator&                              allocator                       = m_context.getDefaultAllocator();
206
207         const int workGroupSize = multiplyComponents(m_localSize);
208         const int workGroupCount = multiplyComponents(m_workSize);
209
210         // Create a buffer and host-visible memory for it
211
212         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
213         const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
214
215         // Create descriptor set
216
217         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
218                 DescriptorSetLayoutBuilder()
219                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
220                 .build(vk, device));
221
222         const Unique<VkDescriptorPool> descriptorPool(
223                 DescriptorPoolBuilder()
224                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
225                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
226
227         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
228
229         const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
230         DescriptorSetUpdateBuilder()
231                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
232                 .update(vk, device);
233
234         // Perform the computation
235
236         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
237         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
238         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
239
240         const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
241
242         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
243         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
244
245         // Start recording commands
246
247         beginCommandBuffer(vk, *cmdBuffer);
248
249         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
250         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
251
252         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
253
254         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
255
256         endCommandBuffer(vk, *cmdBuffer);
257
258         // Wait for completion
259
260         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
261
262         // Validate the results
263
264         const Allocation& bufferAllocation = buffer.getAllocation();
265         invalidateAlloc(vk, device, bufferAllocation);
266
267         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
268
269         for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
270         {
271                 const int globalOffset = groupNdx * workGroupSize;
272                 for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
273                 {
274                         const deUint32 res = bufferPtr[globalOffset + localOffset];
275                         const deUint32 ref = globalOffset + squared(workGroupSize - localOffset - 1);
276
277                         if (res != ref)
278                         {
279                                 std::ostringstream msg;
280                                 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
281                                 return tcu::TestStatus::fail(msg.str());
282                         }
283                 }
284         }
285         return tcu::TestStatus::pass("Compute succeeded");
286 }
287
288 class SharedVarAtomicOpTest : public vkt::TestCase
289 {
290 public:
291                                                 SharedVarAtomicOpTest   (tcu::TestContext&      testCtx,
292                                                                                                  const std::string&     name,
293                                                                                                  const std::string&     description,
294                                                                                                  const tcu::IVec3&      localSize,
295                                                                                                  const tcu::IVec3&      workSize);
296
297         void                            initPrograms                    (SourceCollections& sourceCollections) const;
298         TestInstance*           createInstance                  (Context&                       context) const;
299
300 private:
301         const tcu::IVec3        m_localSize;
302         const tcu::IVec3        m_workSize;
303 };
304
305 class SharedVarAtomicOpTestInstance : public vkt::TestInstance
306 {
307 public:
308                                                                         SharedVarAtomicOpTestInstance   (Context&                       context,
309                                                                                                                                          const tcu::IVec3&      localSize,
310                                                                                                                                          const tcu::IVec3&      workSize);
311
312         tcu::TestStatus                                 iterate                                                 (void);
313
314 private:
315         const tcu::IVec3                                m_localSize;
316         const tcu::IVec3                                m_workSize;
317 };
318
319 SharedVarAtomicOpTest::SharedVarAtomicOpTest (tcu::TestContext&         testCtx,
320                                                                                           const std::string&    name,
321                                                                                           const std::string&    description,
322                                                                                           const tcu::IVec3&             localSize,
323                                                                                           const tcu::IVec3&             workSize)
324         : TestCase              (testCtx, name, description)
325         , m_localSize   (localSize)
326         , m_workSize    (workSize)
327 {
328 }
329
330 void SharedVarAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
331 {
332         const int workGroupSize = multiplyComponents(m_localSize);
333         const int workGroupCount = multiplyComponents(m_workSize);
334         const int numValues = workGroupSize * workGroupCount;
335
336         std::ostringstream src;
337         src << "#version 310 es\n"
338                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
339                 << "layout(binding = 0) writeonly buffer Output {\n"
340                 << "    uint values[" << numValues << "];\n"
341                 << "} sb_out;\n\n"
342                 << "shared uint count;\n\n"
343                 << "void main (void) {\n"
344                 << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
345                 << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
346                 << "    uint globalOffs = localSize*globalNdx;\n"
347                 << "\n"
348                 << "    count = 0u;\n"
349                 << "    memoryBarrierShared();\n"
350                 << "    barrier();\n"
351                 << "    uint oldVal = atomicAdd(count, 1u);\n"
352                 << "    sb_out.values[globalOffs+oldVal] = oldVal+1u;\n"
353                 << "}\n";
354
355         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
356 }
357
358 TestInstance* SharedVarAtomicOpTest::createInstance (Context& context) const
359 {
360         return new SharedVarAtomicOpTestInstance(context, m_localSize, m_workSize);
361 }
362
363 SharedVarAtomicOpTestInstance::SharedVarAtomicOpTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
364         : TestInstance  (context)
365         , m_localSize   (localSize)
366         , m_workSize    (workSize)
367 {
368 }
369
370 tcu::TestStatus SharedVarAtomicOpTestInstance::iterate (void)
371 {
372         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
373         const VkDevice                  device                          = m_context.getDevice();
374         const VkQueue                   queue                           = m_context.getUniversalQueue();
375         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
376         Allocator&                              allocator                       = m_context.getDefaultAllocator();
377
378         const int workGroupSize = multiplyComponents(m_localSize);
379         const int workGroupCount = multiplyComponents(m_workSize);
380
381         // Create a buffer and host-visible memory for it
382
383         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
384         const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
385
386         // Create descriptor set
387
388         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
389                 DescriptorSetLayoutBuilder()
390                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
391                 .build(vk, device));
392
393         const Unique<VkDescriptorPool> descriptorPool(
394                 DescriptorPoolBuilder()
395                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
396                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
397
398         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
399
400         const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
401         DescriptorSetUpdateBuilder()
402                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
403                 .update(vk, device);
404
405         // Perform the computation
406
407         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
408         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
409         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
410
411         const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
412
413         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
414         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
415
416         // Start recording commands
417
418         beginCommandBuffer(vk, *cmdBuffer);
419
420         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
421         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
422
423         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
424
425         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1u, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
426
427         endCommandBuffer(vk, *cmdBuffer);
428
429         // Wait for completion
430
431         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
432
433         // Validate the results
434
435         const Allocation& bufferAllocation = buffer.getAllocation();
436         invalidateAlloc(vk, device, bufferAllocation);
437
438         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
439
440         for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
441         {
442                 const int globalOffset = groupNdx * workGroupSize;
443                 for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
444                 {
445                         const deUint32 res = bufferPtr[globalOffset + localOffset];
446                         const deUint32 ref = localOffset + 1;
447
448                         if (res != ref)
449                         {
450                                 std::ostringstream msg;
451                                 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
452                                 return tcu::TestStatus::fail(msg.str());
453                         }
454                 }
455         }
456         return tcu::TestStatus::pass("Compute succeeded");
457 }
458
459 class SSBOLocalBarrierTest : public vkt::TestCase
460 {
461 public:
462                                                 SSBOLocalBarrierTest    (tcu::TestContext&      testCtx,
463                                                                                                  const std::string& name,
464                                                                                                  const std::string&     description,
465                                                                                                  const tcu::IVec3&      localSize,
466                                                                                                  const tcu::IVec3&      workSize);
467
468         void                            initPrograms                    (SourceCollections& sourceCollections) const;
469         TestInstance*           createInstance                  (Context&                       context) const;
470
471 private:
472         const tcu::IVec3        m_localSize;
473         const tcu::IVec3        m_workSize;
474 };
475
476 class SSBOLocalBarrierTestInstance : public vkt::TestInstance
477 {
478 public:
479                                                                         SSBOLocalBarrierTestInstance    (Context&                       context,
480                                                                                                                                          const tcu::IVec3&      localSize,
481                                                                                                                                          const tcu::IVec3&      workSize);
482
483         tcu::TestStatus                                 iterate                                                 (void);
484
485 private:
486         const tcu::IVec3                                m_localSize;
487         const tcu::IVec3                                m_workSize;
488 };
489
490 SSBOLocalBarrierTest::SSBOLocalBarrierTest (tcu::TestContext&   testCtx,
491                                                                                         const std::string&      name,
492                                                                                         const std::string&      description,
493                                                                                         const tcu::IVec3&       localSize,
494                                                                                         const tcu::IVec3&       workSize)
495         : TestCase              (testCtx, name, description)
496         , m_localSize   (localSize)
497         , m_workSize    (workSize)
498 {
499 }
500
501 void SSBOLocalBarrierTest::initPrograms (SourceCollections& sourceCollections) const
502 {
503         const int workGroupSize = multiplyComponents(m_localSize);
504         const int workGroupCount = multiplyComponents(m_workSize);
505         const int numValues = workGroupSize * workGroupCount;
506
507         std::ostringstream src;
508         src << "#version 310 es\n"
509                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
510                 << "layout(binding = 0) coherent buffer Output {\n"
511                 << "    uint values[" << numValues << "];\n"
512                 << "} sb_out;\n\n"
513                 << "void main (void) {\n"
514                 << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
515                 << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
516                 << "    uint globalOffs = localSize*globalNdx;\n"
517                 << "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
518                 << "\n"
519                 << "    sb_out.values[globalOffs + localOffs] = globalOffs;\n"
520                 << "    memoryBarrierBuffer();\n"
521                 << "    barrier();\n"
522                 << "    sb_out.values[globalOffs + ((localOffs+1u)%localSize)] += localOffs;\n"         // += so we read and write
523                 << "    memoryBarrierBuffer();\n"
524                 << "    barrier();\n"
525                 << "    sb_out.values[globalOffs + ((localOffs+2u)%localSize)] += localOffs;\n"
526                 << "}\n";
527
528         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
529 }
530
531 TestInstance* SSBOLocalBarrierTest::createInstance (Context& context) const
532 {
533         return new SSBOLocalBarrierTestInstance(context, m_localSize, m_workSize);
534 }
535
536 SSBOLocalBarrierTestInstance::SSBOLocalBarrierTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
537         : TestInstance  (context)
538         , m_localSize   (localSize)
539         , m_workSize    (workSize)
540 {
541 }
542
543 tcu::TestStatus SSBOLocalBarrierTestInstance::iterate (void)
544 {
545         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
546         const VkDevice                  device                          = m_context.getDevice();
547         const VkQueue                   queue                           = m_context.getUniversalQueue();
548         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
549         Allocator&                              allocator                       = m_context.getDefaultAllocator();
550
551         const int workGroupSize = multiplyComponents(m_localSize);
552         const int workGroupCount = multiplyComponents(m_workSize);
553
554         // Create a buffer and host-visible memory for it
555
556         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
557         const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
558
559         // Create descriptor set
560
561         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
562                 DescriptorSetLayoutBuilder()
563                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
564                 .build(vk, device));
565
566         const Unique<VkDescriptorPool> descriptorPool(
567                 DescriptorPoolBuilder()
568                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
569                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
570
571         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
572
573         const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
574         DescriptorSetUpdateBuilder()
575                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
576                 .update(vk, device);
577
578         // Perform the computation
579
580         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
581         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
582         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
583
584         const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
585
586         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
587         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
588
589         // Start recording commands
590
591         beginCommandBuffer(vk, *cmdBuffer);
592
593         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
594         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
595
596         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
597
598         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
599
600         endCommandBuffer(vk, *cmdBuffer);
601
602         // Wait for completion
603
604         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
605
606         // Validate the results
607
608         const Allocation& bufferAllocation = buffer.getAllocation();
609         invalidateAlloc(vk, device, bufferAllocation);
610
611         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
612
613         for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
614         {
615                 const int globalOffset = groupNdx * workGroupSize;
616                 for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
617                 {
618                         const deUint32  res             = bufferPtr[globalOffset + localOffset];
619                         const int               offs0   = localOffset - 1 < 0 ? ((localOffset + workGroupSize - 1) % workGroupSize) : ((localOffset - 1) % workGroupSize);
620                         const int               offs1   = localOffset - 2 < 0 ? ((localOffset + workGroupSize - 2) % workGroupSize) : ((localOffset - 2) % workGroupSize);
621                         const deUint32  ref             = static_cast<deUint32>(globalOffset + offs0 + offs1);
622
623                         if (res != ref)
624                         {
625                                 std::ostringstream msg;
626                                 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
627                                 return tcu::TestStatus::fail(msg.str());
628                         }
629                 }
630         }
631         return tcu::TestStatus::pass("Compute succeeded");
632 }
633
634 class CopyImageToSSBOTest : public vkt::TestCase
635 {
636 public:
637                                                 CopyImageToSSBOTest             (tcu::TestContext&      testCtx,
638                                                                                                  const std::string&     name,
639                                                                                                  const std::string&     description,
640                                                                                                  const tcu::IVec2&      localSize,
641                                                                                                  const tcu::IVec2&      imageSize);
642
643         void                            initPrograms                    (SourceCollections& sourceCollections) const;
644         TestInstance*           createInstance                  (Context&                       context) const;
645
646 private:
647         const tcu::IVec2        m_localSize;
648         const tcu::IVec2        m_imageSize;
649 };
650
651 class CopyImageToSSBOTestInstance : public vkt::TestInstance
652 {
653 public:
654                                                                         CopyImageToSSBOTestInstance             (Context&                       context,
655                                                                                                                                          const tcu::IVec2&      localSize,
656                                                                                                                                          const tcu::IVec2&      imageSize);
657
658         tcu::TestStatus                                 iterate                                                 (void);
659
660 private:
661         const tcu::IVec2                                m_localSize;
662         const tcu::IVec2                                m_imageSize;
663 };
664
665 CopyImageToSSBOTest::CopyImageToSSBOTest (tcu::TestContext&             testCtx,
666                                                                                   const std::string&    name,
667                                                                                   const std::string&    description,
668                                                                                   const tcu::IVec2&             localSize,
669                                                                                   const tcu::IVec2&             imageSize)
670         : TestCase              (testCtx, name, description)
671         , m_localSize   (localSize)
672         , m_imageSize   (imageSize)
673 {
674         DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
675         DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
676 }
677
678 void CopyImageToSSBOTest::initPrograms (SourceCollections& sourceCollections) const
679 {
680         std::ostringstream src;
681         src << "#version 310 es\n"
682                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
683                 << "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_srcImg;\n"
684                 << "layout(binding = 0) writeonly buffer Output {\n"
685                 << "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
686                 << "} sb_out;\n\n"
687                 << "void main (void) {\n"
688                 << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
689                 << "    uint value  = imageLoad(u_srcImg, ivec2(gl_GlobalInvocationID.xy)).x;\n"
690                 << "    sb_out.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x] = value;\n"
691                 << "}\n";
692
693         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
694 }
695
696 TestInstance* CopyImageToSSBOTest::createInstance (Context& context) const
697 {
698         return new CopyImageToSSBOTestInstance(context, m_localSize, m_imageSize);
699 }
700
701 CopyImageToSSBOTestInstance::CopyImageToSSBOTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
702         : TestInstance  (context)
703         , m_localSize   (localSize)
704         , m_imageSize   (imageSize)
705 {
706 }
707
708 tcu::TestStatus CopyImageToSSBOTestInstance::iterate (void)
709 {
710         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
711         const VkDevice                  device                          = m_context.getDevice();
712         const VkQueue                   queue                           = m_context.getUniversalQueue();
713         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
714         Allocator&                              allocator                       = m_context.getDefaultAllocator();
715
716         // Create an image
717
718         const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
719         const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
720
721         const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
722         const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
723
724         // Staging buffer (source data for image)
725
726         const deUint32 imageArea = multiplyComponents(m_imageSize);
727         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;
728
729         const Buffer stagingBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_SRC_BIT), MemoryRequirement::HostVisible);
730
731         // Populate the staging buffer with test data
732         {
733                 de::Random rnd(0xab2c7);
734                 const Allocation& stagingBufferAllocation = stagingBuffer.getAllocation();
735                 deUint32* bufferPtr = static_cast<deUint32*>(stagingBufferAllocation.getHostPtr());
736                 for (deUint32 i = 0; i < imageArea; ++i)
737                         *bufferPtr++ = rnd.getUint32();
738
739                 flushAlloc(vk, device, stagingBufferAllocation);
740         }
741
742         // Create a buffer to store shader output
743
744         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
745
746         // Create descriptor set
747
748         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
749                 DescriptorSetLayoutBuilder()
750                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
751                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
752                 .build(vk, device));
753
754         const Unique<VkDescriptorPool> descriptorPool(
755                 DescriptorPoolBuilder()
756                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
757                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
758                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
759
760         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
761
762         // Set the bindings
763
764         const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
765         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
766
767         DescriptorSetUpdateBuilder()
768                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
769                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
770                 .update(vk, device);
771
772         // Perform the computation
773         {
774                 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
775                 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
776                 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
777
778                 const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
779                 const tcu::IVec2 workSize = m_imageSize / m_localSize;
780
781                 // Prepare the command buffer
782
783                 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
784                 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
785
786                 // Start recording commands
787
788                 beginCommandBuffer(vk, *cmdBuffer);
789
790                 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
791                 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
792
793                 const std::vector<VkBufferImageCopy> bufferImageCopy(1, makeBufferImageCopy(m_imageSize));
794                 copyBufferToImage(vk, *cmdBuffer, *stagingBuffer, bufferSizeBytes, bufferImageCopy, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, *image, VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
795
796                 vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
797                 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
798
799                 endCommandBuffer(vk, *cmdBuffer);
800
801                 // Wait for completion
802
803                 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
804         }
805
806         // Validate the results
807
808         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
809         invalidateAlloc(vk, device, outputBufferAllocation);
810
811         const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
812         const deUint32* refBufferPtr = static_cast<deUint32*>(stagingBuffer.getAllocation().getHostPtr());
813
814         for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
815         {
816                 const deUint32 res = *(bufferPtr + ndx);
817                 const deUint32 ref = *(refBufferPtr + ndx);
818
819                 if (res != ref)
820                 {
821                         std::ostringstream msg;
822                         msg << "Comparison failed for Output.values[" << ndx << "]";
823                         return tcu::TestStatus::fail(msg.str());
824                 }
825         }
826         return tcu::TestStatus::pass("Compute succeeded");
827 }
828
829 class CopySSBOToImageTest : public vkt::TestCase
830 {
831 public:
832                                                 CopySSBOToImageTest     (tcu::TestContext&      testCtx,
833                                                                                          const std::string&     name,
834                                                                                          const std::string&     description,
835                                                                                          const tcu::IVec2&      localSize,
836                                                                                          const tcu::IVec2&      imageSize);
837
838         void                            initPrograms            (SourceCollections& sourceCollections) const;
839         TestInstance*           createInstance          (Context&                       context) const;
840
841 private:
842         const tcu::IVec2        m_localSize;
843         const tcu::IVec2        m_imageSize;
844 };
845
846 class CopySSBOToImageTestInstance : public vkt::TestInstance
847 {
848 public:
849                                                                         CopySSBOToImageTestInstance     (Context&                       context,
850                                                                                                                                  const tcu::IVec2&      localSize,
851                                                                                                                                  const tcu::IVec2&      imageSize);
852
853         tcu::TestStatus                                 iterate                                         (void);
854
855 private:
856         const tcu::IVec2                                m_localSize;
857         const tcu::IVec2                                m_imageSize;
858 };
859
860 CopySSBOToImageTest::CopySSBOToImageTest (tcu::TestContext&             testCtx,
861                                                                                   const std::string&    name,
862                                                                                   const std::string&    description,
863                                                                                   const tcu::IVec2&             localSize,
864                                                                                   const tcu::IVec2&             imageSize)
865         : TestCase              (testCtx, name, description)
866         , m_localSize   (localSize)
867         , m_imageSize   (imageSize)
868 {
869         DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
870         DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
871 }
872
873 void CopySSBOToImageTest::initPrograms (SourceCollections& sourceCollections) const
874 {
875         std::ostringstream src;
876         src << "#version 310 es\n"
877                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
878                 << "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_dstImg;\n"
879                 << "layout(binding = 0) readonly buffer Input {\n"
880                 << "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
881                 << "} sb_in;\n\n"
882                 << "void main (void) {\n"
883                 << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
884                 << "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
885                 << "    imageStore(u_dstImg, ivec2(gl_GlobalInvocationID.xy), uvec4(value, 0, 0, 0));\n"
886                 << "}\n";
887
888         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
889 }
890
891 TestInstance* CopySSBOToImageTest::createInstance (Context& context) const
892 {
893         return new CopySSBOToImageTestInstance(context, m_localSize, m_imageSize);
894 }
895
896 CopySSBOToImageTestInstance::CopySSBOToImageTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
897         : TestInstance  (context)
898         , m_localSize   (localSize)
899         , m_imageSize   (imageSize)
900 {
901 }
902
903 tcu::TestStatus CopySSBOToImageTestInstance::iterate (void)
904 {
905         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
906         const VkDevice                  device                          = m_context.getDevice();
907         const VkQueue                   queue                           = m_context.getUniversalQueue();
908         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
909         Allocator&                              allocator                       = m_context.getDefaultAllocator();
910
911         // Create an image
912
913         const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
914         const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
915
916         const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
917         const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
918
919         // Create an input buffer (data to be read in the shader)
920
921         const deUint32 imageArea = multiplyComponents(m_imageSize);
922         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;
923
924         const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
925
926         // Populate the buffer with test data
927         {
928                 de::Random rnd(0x77238ac2);
929                 const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
930                 deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
931                 for (deUint32 i = 0; i < imageArea; ++i)
932                         *bufferPtr++ = rnd.getUint32();
933
934                 flushAlloc(vk, device, inputBufferAllocation);
935         }
936
937         // Create a buffer to store shader output (copied from image data)
938
939         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
940
941         // Create descriptor set
942
943         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
944                 DescriptorSetLayoutBuilder()
945                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
946                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
947                 .build(vk, device));
948
949         const Unique<VkDescriptorPool> descriptorPool(
950                 DescriptorPoolBuilder()
951                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
952                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
953                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
954
955         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
956
957         // Set the bindings
958
959         const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
960         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
961
962         DescriptorSetUpdateBuilder()
963                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
964                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
965                 .update(vk, device);
966
967         // Perform the computation
968         {
969                 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
970                 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
971                 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
972
973                 const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);
974
975                 const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
976                         0u, VK_ACCESS_SHADER_WRITE_BIT,
977                         VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
978                         *image, subresourceRange);
979
980                 const tcu::IVec2 workSize = m_imageSize / m_localSize;
981
982                 // Prepare the command buffer
983
984                 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
985                 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
986
987                 // Start recording commands
988
989                 beginCommandBuffer(vk, *cmdBuffer);
990
991                 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
992                 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
993
994                 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
995                 vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
996
997                 copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);
998
999                 endCommandBuffer(vk, *cmdBuffer);
1000
1001                 // Wait for completion
1002
1003                 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1004         }
1005
1006         // Validate the results
1007
1008         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1009         invalidateAlloc(vk, device, outputBufferAllocation);
1010
1011         const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1012         const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());
1013
1014         for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
1015         {
1016                 const deUint32 res = *(bufferPtr + ndx);
1017                 const deUint32 ref = *(refBufferPtr + ndx);
1018
1019                 if (res != ref)
1020                 {
1021                         std::ostringstream msg;
1022                         msg << "Comparison failed for pixel " << ndx;
1023                         return tcu::TestStatus::fail(msg.str());
1024                 }
1025         }
1026         return tcu::TestStatus::pass("Compute succeeded");
1027 }
1028
1029 class BufferToBufferInvertTest : public vkt::TestCase
1030 {
1031 public:
1032         void                                                            initPrograms                            (SourceCollections&     sourceCollections) const;
1033         TestInstance*                                           createInstance                          (Context&                       context) const;
1034
1035         static BufferToBufferInvertTest*        UBOToSSBOInvertCase                     (tcu::TestContext&      testCtx,
1036                                                                                                                                          const std::string& name,
1037                                                                                                                                          const std::string& description,
1038                                                                                                                                          const deUint32         numValues,
1039                                                                                                                                          const tcu::IVec3&      localSize,
1040                                                                                                                                          const tcu::IVec3&      workSize);
1041
1042         static BufferToBufferInvertTest*        CopyInvertSSBOCase                      (tcu::TestContext&      testCtx,
1043                                                                                                                                          const std::string& name,
1044                                                                                                                                          const std::string& description,
1045                                                                                                                                          const deUint32         numValues,
1046                                                                                                                                          const tcu::IVec3&      localSize,
1047                                                                                                                                          const tcu::IVec3&      workSize);
1048
1049 private:
1050                                                                                 BufferToBufferInvertTest        (tcu::TestContext&      testCtx,
1051                                                                                                                                          const std::string& name,
1052                                                                                                                                          const std::string& description,
1053                                                                                                                                          const deUint32         numValues,
1054                                                                                                                                          const tcu::IVec3&      localSize,
1055                                                                                                                                          const tcu::IVec3&      workSize,
1056                                                                                                                                          const BufferType       bufferType);
1057
1058         const BufferType                                        m_bufferType;
1059         const deUint32                                          m_numValues;
1060         const tcu::IVec3                                        m_localSize;
1061         const tcu::IVec3                                        m_workSize;
1062 };
1063
1064 class BufferToBufferInvertTestInstance : public vkt::TestInstance
1065 {
1066 public:
1067                                                                         BufferToBufferInvertTestInstance        (Context&                       context,
1068                                                                                                                                                  const deUint32         numValues,
1069                                                                                                                                                  const tcu::IVec3&      localSize,
1070                                                                                                                                                  const tcu::IVec3&      workSize,
1071                                                                                                                                                  const BufferType       bufferType);
1072
1073         tcu::TestStatus                                 iterate                                                         (void);
1074
1075 private:
1076         const BufferType                                m_bufferType;
1077         const deUint32                                  m_numValues;
1078         const tcu::IVec3                                m_localSize;
1079         const tcu::IVec3                                m_workSize;
1080 };
1081
1082 BufferToBufferInvertTest::BufferToBufferInvertTest (tcu::TestContext&   testCtx,
1083                                                                                                         const std::string&      name,
1084                                                                                                         const std::string&      description,
1085                                                                                                         const deUint32          numValues,
1086                                                                                                         const tcu::IVec3&       localSize,
1087                                                                                                         const tcu::IVec3&       workSize,
1088                                                                                                         const BufferType        bufferType)
1089         : TestCase              (testCtx, name, description)
1090         , m_bufferType  (bufferType)
1091         , m_numValues   (numValues)
1092         , m_localSize   (localSize)
1093         , m_workSize    (workSize)
1094 {
1095         DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1096         DE_ASSERT(m_bufferType == BUFFER_TYPE_UNIFORM || m_bufferType == BUFFER_TYPE_SSBO);
1097 }
1098
1099 BufferToBufferInvertTest* BufferToBufferInvertTest::UBOToSSBOInvertCase (tcu::TestContext&      testCtx,
1100                                                                                                                                                  const std::string&     name,
1101                                                                                                                                                  const std::string&     description,
1102                                                                                                                                                  const deUint32         numValues,
1103                                                                                                                                                  const tcu::IVec3&      localSize,
1104                                                                                                                                                  const tcu::IVec3&      workSize)
1105 {
1106         return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_UNIFORM);
1107 }
1108
1109 BufferToBufferInvertTest* BufferToBufferInvertTest::CopyInvertSSBOCase (tcu::TestContext&       testCtx,
1110                                                                                                                                                 const std::string&      name,
1111                                                                                                                                                 const std::string&      description,
1112                                                                                                                                                 const deUint32          numValues,
1113                                                                                                                                                 const tcu::IVec3&       localSize,
1114                                                                                                                                                 const tcu::IVec3&       workSize)
1115 {
1116         return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_SSBO);
1117 }
1118
1119 void BufferToBufferInvertTest::initPrograms (SourceCollections& sourceCollections) const
1120 {
1121         std::ostringstream src;
1122         if (m_bufferType == BUFFER_TYPE_UNIFORM)
1123         {
1124                 src << "#version 310 es\n"
1125                         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1126                         << "layout(binding = 0) readonly uniform Input {\n"
1127                         << "    uint values[" << m_numValues << "];\n"
1128                         << "} ub_in;\n"
1129                         << "layout(binding = 1, std140) writeonly buffer Output {\n"
1130                         << "    uint values[" << m_numValues << "];\n"
1131                         << "} sb_out;\n"
1132                         << "void main (void) {\n"
1133                         << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1134                         << "    uint numValuesPerInv = uint(ub_in.values.length()) / (size.x*size.y*size.z);\n"
1135                         << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1136                         << "    uint offset          = numValuesPerInv*groupNdx;\n"
1137                         << "\n"
1138                         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1139                         << "        sb_out.values[offset + ndx] = ~ub_in.values[offset + ndx];\n"
1140                         << "}\n";
1141         }
1142         else if (m_bufferType == BUFFER_TYPE_SSBO)
1143         {
1144                 src << "#version 310 es\n"
1145                         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1146                         << "layout(binding = 0, std140) readonly buffer Input {\n"
1147                         << "    uint values[" << m_numValues << "];\n"
1148                         << "} sb_in;\n"
1149                         << "layout (binding = 1, std140) writeonly buffer Output {\n"
1150                         << "    uint values[" << m_numValues << "];\n"
1151                         << "} sb_out;\n"
1152                         << "void main (void) {\n"
1153                         << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1154                         << "    uint numValuesPerInv = uint(sb_in.values.length()) / (size.x*size.y*size.z);\n"
1155                         << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1156                         << "    uint offset          = numValuesPerInv*groupNdx;\n"
1157                         << "\n"
1158                         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1159                         << "        sb_out.values[offset + ndx] = ~sb_in.values[offset + ndx];\n"
1160                         << "}\n";
1161         }
1162
1163         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1164 }
1165
1166 TestInstance* BufferToBufferInvertTest::createInstance (Context& context) const
1167 {
1168         return new BufferToBufferInvertTestInstance(context, m_numValues, m_localSize, m_workSize, m_bufferType);
1169 }
1170
1171 BufferToBufferInvertTestInstance::BufferToBufferInvertTestInstance (Context&                    context,
1172                                                                                                                                         const deUint32          numValues,
1173                                                                                                                                         const tcu::IVec3&       localSize,
1174                                                                                                                                         const tcu::IVec3&       workSize,
1175                                                                                                                                         const BufferType        bufferType)
1176         : TestInstance  (context)
1177         , m_bufferType  (bufferType)
1178         , m_numValues   (numValues)
1179         , m_localSize   (localSize)
1180         , m_workSize    (workSize)
1181 {
1182 }
1183
1184 tcu::TestStatus BufferToBufferInvertTestInstance::iterate (void)
1185 {
1186         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
1187         const VkDevice                  device                          = m_context.getDevice();
1188         const VkQueue                   queue                           = m_context.getUniversalQueue();
1189         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
1190         Allocator&                              allocator                       = m_context.getDefaultAllocator();
1191
1192         // Customize the test based on buffer type
1193
1194         const VkBufferUsageFlags inputBufferUsageFlags          = (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT : VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
1195         const VkDescriptorType inputBufferDescriptorType        = (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1196         const deUint32 randomSeed                                                       = (m_bufferType == BUFFER_TYPE_UNIFORM ? 0x111223f : 0x124fef);
1197
1198         // Create an input buffer
1199
1200         const VkDeviceSize bufferSizeBytes = sizeof(tcu::UVec4) * m_numValues;
1201         const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, inputBufferUsageFlags), MemoryRequirement::HostVisible);
1202
1203         // Fill the input buffer with data
1204         {
1205                 de::Random rnd(randomSeed);
1206                 const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
1207                 tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(inputBufferAllocation.getHostPtr());
1208                 for (deUint32 i = 0; i < m_numValues; ++i)
1209                         bufferPtr[i].x() = rnd.getUint32();
1210
1211                 flushAlloc(vk, device, inputBufferAllocation);
1212         }
1213
1214         // Create an output buffer
1215
1216         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1217
1218         // Create descriptor set
1219
1220         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1221                 DescriptorSetLayoutBuilder()
1222                 .addSingleBinding(inputBufferDescriptorType, VK_SHADER_STAGE_COMPUTE_BIT)
1223                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1224                 .build(vk, device));
1225
1226         const Unique<VkDescriptorPool> descriptorPool(
1227                 DescriptorPoolBuilder()
1228                 .addType(inputBufferDescriptorType)
1229                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
1230                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1231
1232         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1233
1234         const VkDescriptorBufferInfo inputBufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
1235         const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
1236         DescriptorSetUpdateBuilder()
1237                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), inputBufferDescriptorType, &inputBufferDescriptorInfo)
1238                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
1239                 .update(vk, device);
1240
1241         // Perform the computation
1242
1243         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1244         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1245         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1246
1247         const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);
1248
1249         const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
1250
1251         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1252         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1253
1254         // Start recording commands
1255
1256         beginCommandBuffer(vk, *cmdBuffer);
1257
1258         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1259         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1260
1261         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1262         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1263         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1264
1265         endCommandBuffer(vk, *cmdBuffer);
1266
1267         // Wait for completion
1268
1269         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1270
1271         // Validate the results
1272
1273         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1274         invalidateAlloc(vk, device, outputBufferAllocation);
1275
1276         const tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(outputBufferAllocation.getHostPtr());
1277         const tcu::UVec4* refBufferPtr = static_cast<tcu::UVec4*>(inputBuffer.getAllocation().getHostPtr());
1278
1279         for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1280         {
1281                 const deUint32 res = bufferPtr[ndx].x();
1282                 const deUint32 ref = ~refBufferPtr[ndx].x();
1283
1284                 if (res != ref)
1285                 {
1286                         std::ostringstream msg;
1287                         msg << "Comparison failed for Output.values[" << ndx << "]";
1288                         return tcu::TestStatus::fail(msg.str());
1289                 }
1290         }
1291         return tcu::TestStatus::pass("Compute succeeded");
1292 }
1293
1294 class InvertSSBOInPlaceTest : public vkt::TestCase
1295 {
1296 public:
1297                                                 InvertSSBOInPlaceTest   (tcu::TestContext&      testCtx,
1298                                                                                                  const std::string&     name,
1299                                                                                                  const std::string&     description,
1300                                                                                                  const deUint32         numValues,
1301                                                                                                  const bool                     sized,
1302                                                                                                  const tcu::IVec3&      localSize,
1303                                                                                                  const tcu::IVec3&      workSize);
1304
1305
1306         void                            initPrograms                    (SourceCollections& sourceCollections) const;
1307         TestInstance*           createInstance                  (Context&                       context) const;
1308
1309 private:
1310         const deUint32          m_numValues;
1311         const bool                      m_sized;
1312         const tcu::IVec3        m_localSize;
1313         const tcu::IVec3        m_workSize;
1314 };
1315
1316 class InvertSSBOInPlaceTestInstance : public vkt::TestInstance
1317 {
1318 public:
1319                                                                         InvertSSBOInPlaceTestInstance   (Context&                       context,
1320                                                                                                                                          const deUint32         numValues,
1321                                                                                                                                          const tcu::IVec3&      localSize,
1322                                                                                                                                          const tcu::IVec3&      workSize);
1323
1324         tcu::TestStatus                                 iterate                                                 (void);
1325
1326 private:
1327         const deUint32                                  m_numValues;
1328         const tcu::IVec3                                m_localSize;
1329         const tcu::IVec3                                m_workSize;
1330 };
1331
1332 InvertSSBOInPlaceTest::InvertSSBOInPlaceTest (tcu::TestContext&         testCtx,
1333                                                                                           const std::string&    name,
1334                                                                                           const std::string&    description,
1335                                                                                           const deUint32                numValues,
1336                                                                                           const bool                    sized,
1337                                                                                           const tcu::IVec3&             localSize,
1338                                                                                           const tcu::IVec3&             workSize)
1339         : TestCase              (testCtx, name, description)
1340         , m_numValues   (numValues)
1341         , m_sized               (sized)
1342         , m_localSize   (localSize)
1343         , m_workSize    (workSize)
1344 {
1345         DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1346 }
1347
1348 void InvertSSBOInPlaceTest::initPrograms (SourceCollections& sourceCollections) const
1349 {
1350         std::ostringstream src;
1351         src << "#version 310 es\n"
1352                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1353                 << "layout(binding = 0) buffer InOut {\n"
1354                 << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1355                 << "} sb_inout;\n"
1356                 << "void main (void) {\n"
1357                 << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1358                 << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
1359                 << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1360                 << "    uint offset          = numValuesPerInv*groupNdx;\n"
1361                 << "\n"
1362                 << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1363                 << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
1364                 << "}\n";
1365
1366         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1367 }
1368
1369 TestInstance* InvertSSBOInPlaceTest::createInstance (Context& context) const
1370 {
1371         return new InvertSSBOInPlaceTestInstance(context, m_numValues, m_localSize, m_workSize);
1372 }
1373
1374 InvertSSBOInPlaceTestInstance::InvertSSBOInPlaceTestInstance (Context&                  context,
1375                                                                                                                           const deUint32        numValues,
1376                                                                                                                           const tcu::IVec3&     localSize,
1377                                                                                                                           const tcu::IVec3&     workSize)
1378         : TestInstance  (context)
1379         , m_numValues   (numValues)
1380         , m_localSize   (localSize)
1381         , m_workSize    (workSize)
1382 {
1383 }
1384
1385 tcu::TestStatus InvertSSBOInPlaceTestInstance::iterate (void)
1386 {
1387         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
1388         const VkDevice                  device                          = m_context.getDevice();
1389         const VkQueue                   queue                           = m_context.getUniversalQueue();
1390         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
1391         Allocator&                              allocator                       = m_context.getDefaultAllocator();
1392
1393         // Create an input/output buffer
1394
1395         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
1396         const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1397
1398         // Fill the buffer with data
1399
1400         typedef std::vector<deUint32> data_vector_t;
1401         data_vector_t inputData(m_numValues);
1402
1403         {
1404                 de::Random rnd(0x82ce7f);
1405                 const Allocation& bufferAllocation = buffer.getAllocation();
1406                 deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
1407                 for (deUint32 i = 0; i < m_numValues; ++i)
1408                         inputData[i] = *bufferPtr++ = rnd.getUint32();
1409
1410                 flushAlloc(vk, device, bufferAllocation);
1411         }
1412
1413         // Create descriptor set
1414
1415         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1416                 DescriptorSetLayoutBuilder()
1417                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1418                 .build(vk, device));
1419
1420         const Unique<VkDescriptorPool> descriptorPool(
1421                 DescriptorPoolBuilder()
1422                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
1423                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1424
1425         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1426
1427         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
1428         DescriptorSetUpdateBuilder()
1429                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
1430                 .update(vk, device);
1431
1432         // Perform the computation
1433
1434         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1435         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1436         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1437
1438         const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1439
1440         const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1441
1442         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1443         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1444
1445         // Start recording commands
1446
1447         beginCommandBuffer(vk, *cmdBuffer);
1448
1449         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1450         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1451
1452         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1453         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1454         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1455
1456         endCommandBuffer(vk, *cmdBuffer);
1457
1458         // Wait for completion
1459
1460         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1461
1462         // Validate the results
1463
1464         const Allocation& bufferAllocation = buffer.getAllocation();
1465         invalidateAlloc(vk, device, bufferAllocation);
1466
1467         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
1468
1469         for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1470         {
1471                 const deUint32 res = bufferPtr[ndx];
1472                 const deUint32 ref = ~inputData[ndx];
1473
1474                 if (res != ref)
1475                 {
1476                         std::ostringstream msg;
1477                         msg << "Comparison failed for InOut.values[" << ndx << "]";
1478                         return tcu::TestStatus::fail(msg.str());
1479                 }
1480         }
1481         return tcu::TestStatus::pass("Compute succeeded");
1482 }
1483
1484 class WriteToMultipleSSBOTest : public vkt::TestCase
1485 {
1486 public:
1487                                                 WriteToMultipleSSBOTest (tcu::TestContext&      testCtx,
1488                                                                                                  const std::string&     name,
1489                                                                                                  const std::string&     description,
1490                                                                                                  const deUint32         numValues,
1491                                                                                                  const bool                     sized,
1492                                                                                                  const tcu::IVec3&      localSize,
1493                                                                                                  const tcu::IVec3&      workSize);
1494
1495         void                            initPrograms                    (SourceCollections& sourceCollections) const;
1496         TestInstance*           createInstance                  (Context&                       context) const;
1497
1498 private:
1499         const deUint32          m_numValues;
1500         const bool                      m_sized;
1501         const tcu::IVec3        m_localSize;
1502         const tcu::IVec3        m_workSize;
1503 };
1504
1505 class WriteToMultipleSSBOTestInstance : public vkt::TestInstance
1506 {
1507 public:
1508                                                                         WriteToMultipleSSBOTestInstance (Context&                       context,
1509                                                                                                                                          const deUint32         numValues,
1510                                                                                                                                          const tcu::IVec3&      localSize,
1511                                                                                                                                          const tcu::IVec3&      workSize);
1512
1513         tcu::TestStatus                                 iterate                                                 (void);
1514
1515 private:
1516         const deUint32                                  m_numValues;
1517         const tcu::IVec3                                m_localSize;
1518         const tcu::IVec3                                m_workSize;
1519 };
1520
1521 WriteToMultipleSSBOTest::WriteToMultipleSSBOTest (tcu::TestContext&             testCtx,
1522                                                                                                   const std::string&    name,
1523                                                                                                   const std::string&    description,
1524                                                                                                   const deUint32                numValues,
1525                                                                                                   const bool                    sized,
1526                                                                                                   const tcu::IVec3&             localSize,
1527                                                                                                   const tcu::IVec3&             workSize)
1528         : TestCase              (testCtx, name, description)
1529         , m_numValues   (numValues)
1530         , m_sized               (sized)
1531         , m_localSize   (localSize)
1532         , m_workSize    (workSize)
1533 {
1534         DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1535 }
1536
1537 void WriteToMultipleSSBOTest::initPrograms (SourceCollections& sourceCollections) const
1538 {
1539         std::ostringstream src;
1540         src << "#version 310 es\n"
1541                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1542                 << "layout(binding = 0) writeonly buffer Out0 {\n"
1543                 << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1544                 << "} sb_out0;\n"
1545                 << "layout(binding = 1) writeonly buffer Out1 {\n"
1546                 << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1547                 << "} sb_out1;\n"
1548                 << "void main (void) {\n"
1549                 << "    uvec3 size      = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1550                 << "    uint groupNdx   = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1551                 << "\n"
1552                 << "    {\n"
1553                 << "        uint numValuesPerInv = uint(sb_out0.values.length()) / (size.x*size.y*size.z);\n"
1554                 << "        uint offset          = numValuesPerInv*groupNdx;\n"
1555                 << "\n"
1556                 << "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1557                 << "            sb_out0.values[offset + ndx] = offset + ndx;\n"
1558                 << "    }\n"
1559                 << "    {\n"
1560                 << "        uint numValuesPerInv = uint(sb_out1.values.length()) / (size.x*size.y*size.z);\n"
1561                 << "        uint offset          = numValuesPerInv*groupNdx;\n"
1562                 << "\n"
1563                 << "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1564                 << "            sb_out1.values[offset + ndx] = uint(sb_out1.values.length()) - offset - ndx;\n"
1565                 << "    }\n"
1566                 << "}\n";
1567
1568         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1569 }
1570
1571 TestInstance* WriteToMultipleSSBOTest::createInstance (Context& context) const
1572 {
1573         return new WriteToMultipleSSBOTestInstance(context, m_numValues, m_localSize, m_workSize);
1574 }
1575
1576 WriteToMultipleSSBOTestInstance::WriteToMultipleSSBOTestInstance (Context&                      context,
1577                                                                                                                                   const deUint32        numValues,
1578                                                                                                                                   const tcu::IVec3&     localSize,
1579                                                                                                                                   const tcu::IVec3&     workSize)
1580         : TestInstance  (context)
1581         , m_numValues   (numValues)
1582         , m_localSize   (localSize)
1583         , m_workSize    (workSize)
1584 {
1585 }
1586
1587 tcu::TestStatus WriteToMultipleSSBOTestInstance::iterate (void)
1588 {
1589         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
1590         const VkDevice                  device                          = m_context.getDevice();
1591         const VkQueue                   queue                           = m_context.getUniversalQueue();
1592         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
1593         Allocator&                              allocator                       = m_context.getDefaultAllocator();
1594
1595         // Create two output buffers
1596
1597         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
1598         const Buffer buffer0(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1599         const Buffer buffer1(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1600
1601         // Create descriptor set
1602
1603         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1604                 DescriptorSetLayoutBuilder()
1605                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1606                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1607                 .build(vk, device));
1608
1609         const Unique<VkDescriptorPool> descriptorPool(
1610                 DescriptorPoolBuilder()
1611                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
1612                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1613
1614         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1615
1616         const VkDescriptorBufferInfo buffer0DescriptorInfo = makeDescriptorBufferInfo(*buffer0, 0ull, bufferSizeBytes);
1617         const VkDescriptorBufferInfo buffer1DescriptorInfo = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
1618         DescriptorSetUpdateBuilder()
1619                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer0DescriptorInfo)
1620                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer1DescriptorInfo)
1621                 .update(vk, device);
1622
1623         // Perform the computation
1624
1625         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1626         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1627         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1628
1629         const VkBufferMemoryBarrier shaderWriteBarriers[] =
1630         {
1631                 makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer0, 0ull, bufferSizeBytes),
1632                 makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes)
1633         };
1634
1635         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1636         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1637
1638         // Start recording commands
1639
1640         beginCommandBuffer(vk, *cmdBuffer);
1641
1642         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1643         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1644
1645         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1646         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, DE_LENGTH_OF_ARRAY(shaderWriteBarriers), shaderWriteBarriers, 0, (const VkImageMemoryBarrier*)DE_NULL);
1647
1648         endCommandBuffer(vk, *cmdBuffer);
1649
1650         // Wait for completion
1651
1652         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1653
1654         // Validate the results
1655         {
1656                 const Allocation& buffer0Allocation = buffer0.getAllocation();
1657                 invalidateAlloc(vk, device, buffer0Allocation);
1658                 const deUint32* buffer0Ptr = static_cast<deUint32*>(buffer0Allocation.getHostPtr());
1659
1660                 for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1661                 {
1662                         const deUint32 res = buffer0Ptr[ndx];
1663                         const deUint32 ref = ndx;
1664
1665                         if (res != ref)
1666                         {
1667                                 std::ostringstream msg;
1668                                 msg << "Comparison failed for Out0.values[" << ndx << "] res=" << res << " ref=" << ref;
1669                                 return tcu::TestStatus::fail(msg.str());
1670                         }
1671                 }
1672         }
1673         {
1674                 const Allocation& buffer1Allocation = buffer1.getAllocation();
1675                 invalidateAlloc(vk, device, buffer1Allocation);
1676                 const deUint32* buffer1Ptr = static_cast<deUint32*>(buffer1Allocation.getHostPtr());
1677
1678                 for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1679                 {
1680                         const deUint32 res = buffer1Ptr[ndx];
1681                         const deUint32 ref = m_numValues - ndx;
1682
1683                         if (res != ref)
1684                         {
1685                                 std::ostringstream msg;
1686                                 msg << "Comparison failed for Out1.values[" << ndx << "] res=" << res << " ref=" << ref;
1687                                 return tcu::TestStatus::fail(msg.str());
1688                         }
1689                 }
1690         }
1691         return tcu::TestStatus::pass("Compute succeeded");
1692 }
1693
1694 class SSBOBarrierTest : public vkt::TestCase
1695 {
1696 public:
1697                                                 SSBOBarrierTest         (tcu::TestContext&      testCtx,
1698                                                                                          const std::string&     name,
1699                                                                                          const std::string&     description,
1700                                                                                          const tcu::IVec3&      workSize);
1701
1702         void                            initPrograms            (SourceCollections& sourceCollections) const;
1703         TestInstance*           createInstance          (Context&                       context) const;
1704
1705 private:
1706         const tcu::IVec3        m_workSize;
1707 };
1708
1709 class SSBOBarrierTestInstance : public vkt::TestInstance
1710 {
1711 public:
1712                                                                         SSBOBarrierTestInstance         (Context&                       context,
1713                                                                                                                                  const tcu::IVec3&      workSize);
1714
1715         tcu::TestStatus                                 iterate                                         (void);
1716
1717 private:
1718         const tcu::IVec3                                m_workSize;
1719 };
1720
1721 SSBOBarrierTest::SSBOBarrierTest (tcu::TestContext&             testCtx,
1722                                                                   const std::string&    name,
1723                                                                   const std::string&    description,
1724                                                                   const tcu::IVec3&             workSize)
1725         : TestCase              (testCtx, name, description)
1726         , m_workSize    (workSize)
1727 {
1728 }
1729
1730 void SSBOBarrierTest::initPrograms (SourceCollections& sourceCollections) const
1731 {
1732         sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
1733                 "#version 310 es\n"
1734                 "layout (local_size_x = 1) in;\n"
1735                 "layout(binding = 2) readonly uniform Constants {\n"
1736                 "    uint u_baseVal;\n"
1737                 "};\n"
1738                 "layout(binding = 1) writeonly buffer Output {\n"
1739                 "    uint values[];\n"
1740                 "};\n"
1741                 "void main (void) {\n"
1742                 "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1743                 "    values[offset] = u_baseVal + offset;\n"
1744                 "}\n");
1745
1746         sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
1747                 "#version 310 es\n"
1748                 "layout (local_size_x = 1) in;\n"
1749                 "layout(binding = 1) readonly buffer Input {\n"
1750                 "    uint values[];\n"
1751                 "};\n"
1752                 "layout(binding = 0) coherent buffer Output {\n"
1753                 "    uint sum;\n"
1754                 "};\n"
1755                 "void main (void) {\n"
1756                 "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1757                 "    uint value  = values[offset];\n"
1758                 "    atomicAdd(sum, value);\n"
1759                 "}\n");
1760 }
1761
1762 TestInstance* SSBOBarrierTest::createInstance (Context& context) const
1763 {
1764         return new SSBOBarrierTestInstance(context, m_workSize);
1765 }
1766
1767 SSBOBarrierTestInstance::SSBOBarrierTestInstance (Context& context, const tcu::IVec3& workSize)
1768         : TestInstance  (context)
1769         , m_workSize    (workSize)
1770 {
1771 }
1772
1773 tcu::TestStatus SSBOBarrierTestInstance::iterate (void)
1774 {
1775         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
1776         const VkDevice                  device                          = m_context.getDevice();
1777         const VkQueue                   queue                           = m_context.getUniversalQueue();
1778         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
1779         Allocator&                              allocator                       = m_context.getDefaultAllocator();
1780
1781         // Create a work buffer used by both shaders
1782
1783         const int workGroupCount = multiplyComponents(m_workSize);
1784         const VkDeviceSize workBufferSizeBytes = sizeof(deUint32) * workGroupCount;
1785         const Buffer workBuffer(vk, device, allocator, makeBufferCreateInfo(workBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::Any);
1786
1787         // Create an output buffer
1788
1789         const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
1790         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1791
1792         // Initialize atomic counter value to zero
1793         {
1794                 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1795                 deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1796                 *outputBufferPtr = 0;
1797                 flushAlloc(vk, device, outputBufferAllocation);
1798         }
1799
1800         // Create a uniform buffer (to pass uniform constants)
1801
1802         const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
1803         const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
1804
1805         // Set the constants in the uniform buffer
1806
1807         const deUint32  baseValue = 127;
1808         {
1809                 const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
1810                 deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
1811                 uniformBufferPtr[0] = baseValue;
1812
1813                 flushAlloc(vk, device, uniformBufferAllocation);
1814         }
1815
1816         // Create descriptor set
1817
1818         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1819                 DescriptorSetLayoutBuilder()
1820                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1821                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1822                 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1823                 .build(vk, device));
1824
1825         const Unique<VkDescriptorPool> descriptorPool(
1826                 DescriptorPoolBuilder()
1827                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
1828                 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
1829                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1830
1831         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1832
1833         const VkDescriptorBufferInfo workBufferDescriptorInfo = makeDescriptorBufferInfo(*workBuffer, 0ull, workBufferSizeBytes);
1834         const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
1835         const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
1836         DescriptorSetUpdateBuilder()
1837                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
1838                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &workBufferDescriptorInfo)
1839                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
1840                 .update(vk, device);
1841
1842         // Perform the computation
1843
1844         const Unique<VkShaderModule> shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
1845         const Unique<VkShaderModule> shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));
1846
1847         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1848         const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
1849         const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));
1850
1851         const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
1852
1853         const VkBufferMemoryBarrier betweenShadersBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *workBuffer, 0ull, workBufferSizeBytes);
1854
1855         const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
1856
1857         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1858         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1859
1860         // Start recording commands
1861
1862         beginCommandBuffer(vk, *cmdBuffer);
1863
1864         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
1865         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1866
1867         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1868
1869         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1870         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &betweenShadersBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1871
1872         // Switch to the second shader program
1873         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
1874
1875         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1876         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1877
1878         endCommandBuffer(vk, *cmdBuffer);
1879
1880         // Wait for completion
1881
1882         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1883
1884         // Validate the results
1885
1886         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1887         invalidateAlloc(vk, device, outputBufferAllocation);
1888
1889         const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1890         const deUint32  res = *bufferPtr;
1891         deUint32                ref = 0;
1892
1893         for (int ndx = 0; ndx < workGroupCount; ++ndx)
1894                 ref += baseValue + ndx;
1895
1896         if (res != ref)
1897         {
1898                 std::ostringstream msg;
1899                 msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
1900                 return tcu::TestStatus::fail(msg.str());
1901         }
1902         return tcu::TestStatus::pass("Compute succeeded");
1903 }
1904
1905 class ImageAtomicOpTest : public vkt::TestCase
1906 {
1907 public:
1908                                                 ImageAtomicOpTest               (tcu::TestContext&      testCtx,
1909                                                                                                  const std::string& name,
1910                                                                                                  const std::string& description,
1911                                                                                                  const deUint32         localSize,
1912                                                                                                  const tcu::IVec2&      imageSize);
1913
1914         void                            initPrograms                    (SourceCollections& sourceCollections) const;
1915         TestInstance*           createInstance                  (Context&                       context) const;
1916
1917 private:
1918         const deUint32          m_localSize;
1919         const tcu::IVec2        m_imageSize;
1920 };
1921
1922 class ImageAtomicOpTestInstance : public vkt::TestInstance
1923 {
1924 public:
1925                                                                         ImageAtomicOpTestInstance               (Context&                       context,
1926                                                                                                                                          const deUint32         localSize,
1927                                                                                                                                          const tcu::IVec2&      imageSize);
1928
1929         tcu::TestStatus                                 iterate                                                 (void);
1930
1931 private:
1932         const deUint32                                  m_localSize;
1933         const tcu::IVec2                                m_imageSize;
1934 };
1935
1936 ImageAtomicOpTest::ImageAtomicOpTest (tcu::TestContext&         testCtx,
1937                                                                           const std::string&    name,
1938                                                                           const std::string&    description,
1939                                                                           const deUint32                localSize,
1940                                                                           const tcu::IVec2&             imageSize)
1941         : TestCase              (testCtx, name, description)
1942         , m_localSize   (localSize)
1943         , m_imageSize   (imageSize)
1944 {
1945 }
1946
1947 void ImageAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
1948 {
1949         std::ostringstream src;
1950         src << "#version 310 es\n"
1951                 << "#extension GL_OES_shader_image_atomic : require\n"
1952                 << "layout (local_size_x = " << m_localSize << ") in;\n"
1953                 << "layout(binding = 1, r32ui) coherent uniform highp uimage2D u_dstImg;\n"
1954                 << "layout(binding = 0) readonly buffer Input {\n"
1955                 << "    uint values[" << (multiplyComponents(m_imageSize) * m_localSize) << "];\n"
1956                 << "} sb_in;\n\n"
1957                 << "void main (void) {\n"
1958                 << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
1959                 << "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
1960                 << "\n"
1961                 << "    if (gl_LocalInvocationIndex == 0u)\n"
1962                 << "        imageStore(u_dstImg, ivec2(gl_WorkGroupID.xy), uvec4(0));\n"
1963                 << "    memoryBarrierImage();\n"
1964                 << "    barrier();\n"
1965                 << "    imageAtomicAdd(u_dstImg, ivec2(gl_WorkGroupID.xy), value);\n"
1966                 << "}\n";
1967
1968         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1969 }
1970
1971 TestInstance* ImageAtomicOpTest::createInstance (Context& context) const
1972 {
1973         return new ImageAtomicOpTestInstance(context, m_localSize, m_imageSize);
1974 }
1975
1976 ImageAtomicOpTestInstance::ImageAtomicOpTestInstance (Context& context, const deUint32 localSize, const tcu::IVec2& imageSize)
1977         : TestInstance  (context)
1978         , m_localSize   (localSize)
1979         , m_imageSize   (imageSize)
1980 {
1981 }
1982
1983 tcu::TestStatus ImageAtomicOpTestInstance::iterate (void)
1984 {
1985         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
1986         const VkDevice                  device                          = m_context.getDevice();
1987         const VkQueue                   queue                           = m_context.getUniversalQueue();
1988         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
1989         Allocator&                              allocator                       = m_context.getDefaultAllocator();
1990
1991         // Create an image
1992
1993         const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
1994         const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
1995
1996         const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
1997         const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
1998
1999         // Input buffer
2000
2001         const deUint32 numInputValues = multiplyComponents(m_imageSize) * m_localSize;
2002         const VkDeviceSize inputBufferSizeBytes = sizeof(deUint32) * numInputValues;
2003
2004         const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(inputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2005
2006         // Populate the input buffer with test data
2007         {
2008                 de::Random rnd(0x77238ac2);
2009                 const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
2010                 deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
2011                 for (deUint32 i = 0; i < numInputValues; ++i)
2012                         *bufferPtr++ = rnd.getUint32();
2013
2014                 flushAlloc(vk, device, inputBufferAllocation);
2015         }
2016
2017         // Create a buffer to store shader output (copied from image data)
2018
2019         const deUint32 imageArea = multiplyComponents(m_imageSize);
2020         const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32) * imageArea;
2021         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
2022
2023         // Create descriptor set
2024
2025         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2026                 DescriptorSetLayoutBuilder()
2027                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2028                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2029                 .build(vk, device));
2030
2031         const Unique<VkDescriptorPool> descriptorPool(
2032                 DescriptorPoolBuilder()
2033                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2034                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2035                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2036
2037         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2038
2039         // Set the bindings
2040
2041         const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2042         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, inputBufferSizeBytes);
2043
2044         DescriptorSetUpdateBuilder()
2045                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2046                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2047                 .update(vk, device);
2048
2049         // Perform the computation
2050         {
2051                 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2052                 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2053                 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
2054
2055                 const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, inputBufferSizeBytes);
2056
2057                 const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
2058                         (VkAccessFlags)0, VK_ACCESS_SHADER_WRITE_BIT,
2059                         VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
2060                         *image, subresourceRange);
2061
2062                 // Prepare the command buffer
2063
2064                 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2065                 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2066
2067                 // Start recording commands
2068
2069                 beginCommandBuffer(vk, *cmdBuffer);
2070
2071                 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2072                 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2073
2074                 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
2075                 vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2076
2077                 copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);
2078
2079                 endCommandBuffer(vk, *cmdBuffer);
2080
2081                 // Wait for completion
2082
2083                 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2084         }
2085
2086         // Validate the results
2087
2088         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2089         invalidateAlloc(vk, device, outputBufferAllocation);
2090
2091         const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2092         const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());
2093
2094         for (deUint32 pixelNdx = 0; pixelNdx < imageArea; ++pixelNdx)
2095         {
2096                 const deUint32  res = bufferPtr[pixelNdx];
2097                 deUint32                ref = 0;
2098
2099                 for (deUint32 offs = 0; offs < m_localSize; ++offs)
2100                         ref += refBufferPtr[pixelNdx * m_localSize + offs];
2101
2102                 if (res != ref)
2103                 {
2104                         std::ostringstream msg;
2105                         msg << "Comparison failed for pixel " << pixelNdx;
2106                         return tcu::TestStatus::fail(msg.str());
2107                 }
2108         }
2109         return tcu::TestStatus::pass("Compute succeeded");
2110 }
2111
2112 class ImageBarrierTest : public vkt::TestCase
2113 {
2114 public:
2115                                                 ImageBarrierTest        (tcu::TestContext&      testCtx,
2116                                                                                         const std::string&      name,
2117                                                                                         const std::string&      description,
2118                                                                                         const tcu::IVec2&       imageSize);
2119
2120         void                            initPrograms            (SourceCollections& sourceCollections) const;
2121         TestInstance*           createInstance          (Context&                       context) const;
2122
2123 private:
2124         const tcu::IVec2        m_imageSize;
2125 };
2126
2127 class ImageBarrierTestInstance : public vkt::TestInstance
2128 {
2129 public:
2130                                                                         ImageBarrierTestInstance        (Context&                       context,
2131                                                                                                                                  const tcu::IVec2&      imageSize);
2132
2133         tcu::TestStatus                                 iterate                                         (void);
2134
2135 private:
2136         const tcu::IVec2                                m_imageSize;
2137 };
2138
2139 ImageBarrierTest::ImageBarrierTest (tcu::TestContext&   testCtx,
2140                                                                         const std::string&      name,
2141                                                                         const std::string&      description,
2142                                                                         const tcu::IVec2&       imageSize)
2143         : TestCase              (testCtx, name, description)
2144         , m_imageSize   (imageSize)
2145 {
2146 }
2147
2148 void ImageBarrierTest::initPrograms (SourceCollections& sourceCollections) const
2149 {
2150         sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
2151                 "#version 310 es\n"
2152                 "layout (local_size_x = 1) in;\n"
2153                 "layout(binding = 2) readonly uniform Constants {\n"
2154                 "    uint u_baseVal;\n"
2155                 "};\n"
2156                 "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_img;\n"
2157                 "void main (void) {\n"
2158                 "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
2159                 "    imageStore(u_img, ivec2(gl_WorkGroupID.xy), uvec4(offset + u_baseVal, 0, 0, 0));\n"
2160                 "}\n");
2161
2162         sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
2163                 "#version 310 es\n"
2164                 "layout (local_size_x = 1) in;\n"
2165                 "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_img;\n"
2166                 "layout(binding = 0) coherent buffer Output {\n"
2167                 "    uint sum;\n"
2168                 "};\n"
2169                 "void main (void) {\n"
2170                 "    uint value = imageLoad(u_img, ivec2(gl_WorkGroupID.xy)).x;\n"
2171                 "    atomicAdd(sum, value);\n"
2172                 "}\n");
2173 }
2174
2175 TestInstance* ImageBarrierTest::createInstance (Context& context) const
2176 {
2177         return new ImageBarrierTestInstance(context, m_imageSize);
2178 }
2179
2180 ImageBarrierTestInstance::ImageBarrierTestInstance (Context& context, const tcu::IVec2& imageSize)
2181         : TestInstance  (context)
2182         , m_imageSize   (imageSize)
2183 {
2184 }
2185
// Executes the image barrier test on the default context device.
//
// Two compute pipelines share one descriptor set: "comp0" stores one value per
// work group into the r32ui storage image (binding 1), and "comp1" loads each
// texel and adds it with atomicAdd to a single uint in the output storage
// buffer (binding 0). An image memory barrier is recorded between the two
// dispatches. After the submission completes, the accumulated sum is compared
// with a reference computed on the host.
//
// Returns TestStatus::pass when the sums match and TestStatus::fail otherwise.
2186 tcu::TestStatus ImageBarrierTestInstance::iterate (void)
2187 {
2188         const DeviceInterface&  vk                                      = m_context.getDeviceInterface();
2189         const VkDevice                  device                          = m_context.getDevice();
2190         const VkQueue                   queue                           = m_context.getUniversalQueue();
2191         const deUint32                  queueFamilyIndex        = m_context.getUniversalQueueFamilyIndex();
2192         Allocator&                              allocator                       = m_context.getDefaultAllocator();
2193
2194         // Create an image used by both shaders
2195
2196         const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_STORAGE_BIT);
2197         const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
2198
2199         const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
2200         const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
2201
2202         // Create an output buffer
2203
             // The buffer holds exactly one deUint32: the sum accumulated by "comp1".
2204         const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
2205         const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2206
2207         // Initialize atomic counter value to zero
2208         {
2209                 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2210                 deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2211                 *outputBufferPtr = 0;
2212                 flushAlloc(vk, device, outputBufferAllocation);
2213         }
2214
2215         // Create a uniform buffer (to pass uniform constants)
2216
2217         const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
2218         const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2219
2220         // Set the constants in the uniform buffer
2221
             // baseValue is what "comp0" reads as u_baseVal and adds to every stored value.
2222         const deUint32  baseValue = 127;
2223         {
2224                 const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
2225                 deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
2226                 uniformBufferPtr[0] = baseValue;
2227
2228                 flushAlloc(vk, device, uniformBufferAllocation);
2229         }
2230
2231         // Create descriptor set
2232
             // Presumably bindings are numbered in the order they are added (0, 1, 2),
             // which matches the binding indices used in the update below - TODO confirm.
2233         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2234                 DescriptorSetLayoutBuilder()
2235                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2236                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2237                 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2238                 .build(vk, device));
2239
2240         const Unique<VkDescriptorPool> descriptorPool(
2241                 DescriptorPoolBuilder()
2242                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2243                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2244                 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2245                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2246
2247         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2248
             // No sampler is supplied (DE_NULL); the image is described in GENERAL layout.
2249         const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2250         const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
2251         const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
             // Binding 0 = output sum buffer, binding 1 = shared image, binding 2 = constants;
             // these match the binding qualifiers in the "comp0" and "comp1" shader sources.
2252         DescriptorSetUpdateBuilder()
2253                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
2254                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2255                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2256                 .update(vk, device);
2257
2258         // Perform the computation
2259
2260         const Unique<VkShaderModule>    shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
2261         const Unique<VkShaderModule>    shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));
2262
             // Both pipelines use the same pipeline layout, so the descriptor set is bound only once below.
2263         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2264         const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
2265         const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));
2266
             // Host write -> uniform read on the constants buffer.
2267         const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2268
             // Layout transition UNDEFINED -> GENERAL; both access masks are empty.
2269         const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
2270                 0u, 0u,
2271                 VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
2272                 *image, subresourceRange);
2273
             // The barrier under test: shader write -> shader read on the image, layout unchanged.
2274         const VkImageMemoryBarrier imageBarrierBetweenShaders = makeImageMemoryBarrier(
2275                 VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
2276                 VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_GENERAL,
2277                 *image, subresourceRange);
2278
             // Shader write -> host read on the output buffer, for the readback after submission.
2279         const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
2280
2281         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2282         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2283
2284         // Start recording commands
2285
2286         beginCommandBuffer(vk, *cmdBuffer);
2287
2288         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
2289         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2290
             // Host stage -> compute stage: carries the uniform buffer barrier and the image layout transition.
2291         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 1, &imageLayoutBarrier);
2292
             // First pass ("comp0"): one work group per texel, since the shader declares
             // local_size_x = 1 and addresses the image with gl_WorkGroupID.xy.
2293         vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
             // Compute stage -> compute stage, with only the image barrier between the two dispatches.
2294         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 0, (const VkBufferMemoryBarrier*)DE_NULL, 1, &imageBarrierBetweenShaders);
2295
2296         // Switch to the second shader program
2297         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
2298
             // Second pass ("comp1"): same grid; every work group adds its texel to the sum.
2299         vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2300         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2301
2302         endCommandBuffer(vk, *cmdBuffer);
2303
2304         // Wait for completion
2305
2306         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2307
2308         // Validate the results
2309
2310         const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2311         invalidateAlloc(vk, device, outputBufferAllocation);
2312
             // multiplyComponents() presumably yields width * height, i.e. the texel count - TODO confirm.
2313         const int               numValues = multiplyComponents(m_imageSize);
2314         const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2315         const deUint32  res = *bufferPtr;
2316         deUint32                ref = 0;
2317
             // "comp0" stores (linear work group index + u_baseVal) in each texel, so the
             // expected sum is baseValue + ndx accumulated over every texel index.
2318         for (int ndx = 0; ndx < numValues; ++ndx)
2319                 ref += baseValue + ndx;
2320
2321         if (res != ref)
2322         {
2323                 std::ostringstream msg;
2324                 msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
2325                 return tcu::TestStatus::fail(msg.str());
2326         }
2327         return tcu::TestStatus::pass("Compute succeeded");
2328 }
2329
2330 class ComputeTestInstance : public vkt::TestInstance
2331 {
2332 public:
2333                 ComputeTestInstance             (Context& context)
2334                 : TestInstance                  (context)
2335                 , m_numPhysDevices              (1)
2336                 , m_queueFamilyIndex    (0)
2337         {
2338                 createDeviceGroup();
2339         }
2340
2341                 ~ComputeTestInstance    ()
2342         {
2343         }
2344
2345         void                                                    createDeviceGroup       (void);
2346         const vk::DeviceInterface&              getDeviceInterface      (void)                  { return *m_deviceDriver; }
2347         vk::VkInstance                                  getInstance                     (void)                  { return m_deviceGroupInstance; }
2348         vk::VkDevice                                    getDevice                       (void)                  { return *m_logicalDevice; }
2349         vk::VkPhysicalDevice                    getPhysicalDevice       (deUint32 i = 0){ return m_physicalDevices[i]; }
2350
2351 protected:
2352         deUint32                                                        m_numPhysDevices;
2353         deUint32                                                        m_queueFamilyIndex;
2354
2355 private:
2356         CustomInstance                                          m_deviceGroupInstance;
2357         vk::Move<vk::VkDevice>                          m_logicalDevice;
2358         std::vector<vk::VkPhysicalDevice>       m_physicalDevices;
2359 #ifndef CTS_USES_VULKANSC
2360         de::MovePtr<vk::DeviceDriver>           m_deviceDriver;
2361 #else
2362         de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter>        m_deviceDriver;
2363 #endif // CTS_USES_VULKANSC
2364 };
2365
// Creates the device-group instance, logical device and device driver used by
// this test instance.
//
// The device group and the physical device within it are chosen from the
// command line (getVKDeviceGroupId() / getVKDeviceId(), each minus one to get
// an index). m_numPhysDevices and m_physicalDevices are filled from the chosen
// group, m_queueFamilyIndex is set to a queue family that reports
// VK_QUEUE_COMPUTE_BIT, and a logical device with one queue from that family
// is created with VkDeviceGroupDeviceCreateInfo chained into pNext.
//
// NOTE(review): devGroupIdx and physDeviceIdx are used as array indices without
// a visible range check against the enumerated groups/devices - confirm the
// command line values are validated elsewhere.
2366 void ComputeTestInstance::createDeviceGroup (void)
2367 {
2368         const tcu::CommandLine&                                                 cmdLine                                 = m_context.getTestContext().getCommandLine();
2369         const deUint32                                                                  devGroupIdx                             = cmdLine.getVKDeviceGroupId() - 1;
2370         const deUint32                                                                  physDeviceIdx                   = cmdLine.getVKDeviceId() - 1;
2371         const float                                                                             queuePriority                   = 1.0f;
2372         const std::vector<std::string>                                  requiredExtensions              (1, "VK_KHR_device_group_creation");
2373         m_deviceGroupInstance                                                                                                   = createCustomInstanceWithExtensions(m_context, requiredExtensions);
2374         std::vector<VkPhysicalDeviceGroupProperties>    devGroupProperties              = enumeratePhysicalDeviceGroups(m_context.getInstanceInterface(), m_deviceGroupInstance);
2375         m_numPhysDevices                                                                                                                = devGroupProperties[devGroupIdx].physicalDeviceCount;
2376         std::vector<const char*>                                                deviceExtensions;
2377
             // The device extension is requested only when it is not already core in the API version in use.
2378         if (!isCoreDeviceExtension(m_context.getUsedApiVersion(), "VK_KHR_device_group"))
2379                 deviceExtensions.push_back("VK_KHR_device_group");
2380
             // The logical device spans every physical device of the selected group.
2381         VkDeviceGroupDeviceCreateInfo                                   deviceGroupInfo                 =
2382         {
2383                 VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO,                                                                      //stype
2384                 DE_NULL,                                                                                                                                                        //pNext
2385                 devGroupProperties[devGroupIdx].physicalDeviceCount,                                                            //physicalDeviceCount
2386                 devGroupProperties[devGroupIdx].physicalDevices                                                                         //physicalDevices
2387         };
2388         const InstanceDriver&                                                   instance                                (m_deviceGroupInstance.getDriver());
             // Features and queue families are queried from the physical device selected on the command line.
2389         const VkPhysicalDeviceFeatures                                  deviceFeatures                  = getPhysicalDeviceFeatures(instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx]);
2390         const std::vector<VkQueueFamilyProperties>              queueProps                              = getPhysicalDeviceQueueFamilyProperties(instance, devGroupProperties[devGroupIdx].physicalDevices[physDeviceIdx]);
2391
2392         m_physicalDevices.resize(m_numPhysDevices);
2393         for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2394                 m_physicalDevices[physDevIdx] = devGroupProperties[devGroupIdx].physicalDevices[physDevIdx];
2395
             // The loop does not break, so the last family reporting VK_QUEUE_COMPUTE_BIT is used.
             // If no family matches, m_queueFamilyIndex keeps the value it had on entry.
2396         for (size_t queueNdx = 0; queueNdx < queueProps.size(); queueNdx++)
2397         {
2398                 if (queueProps[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
2399                         m_queueFamilyIndex = (deUint32)queueNdx;
2400         }
2401
             // A single queue is requested from the chosen family.
2402         VkDeviceQueueCreateInfo                                                 queueInfo                               =
2403         {
2404                 VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,             // VkStructureType                                      sType;
2405                 DE_NULL,                                                                                // const void*                                          pNext;
2406                 (VkDeviceQueueCreateFlags)0u,                                   // VkDeviceQueueCreateFlags                     flags;
2407                 m_queueFamilyIndex,                                                             // deUint32                                                     queueFamilyIndex;
2408                 1u,                                                                                             // deUint32                                                     queueCount;
2409                 &queuePriority                                                                  // const float*                                         pQueuePriorities;
2410         };
2411
             // Head of the VkDeviceCreateInfo pNext chain; the Vulkan SC structures below are
             // prepended in front of deviceGroupInfo when CTS_USES_VULKANSC is defined.
2412         void* pNext                                                                                             = &deviceGroupInfo;
2413 #ifdef CTS_USES_VULKANSC
2414         VkDeviceObjectReservationCreateInfo memReservationInfo  = cmdLine.isSubProcess() ? m_context.getResourceInterface()->getStatMax() : resetDeviceObjectReservationCreateInfo();
2415         memReservationInfo.pNext                                                                = pNext;
2416         pNext                                                                                                   = &memReservationInfo;
2417
2418         VkPhysicalDeviceVulkanSC10Features sc10Features                 = createDefaultSC10Features();
2419         sc10Features.pNext                                                                              = pNext;
2420         pNext                                                                                                   = &sc10Features;
             // pcCI and poolSizes are declared at function scope because memReservationInfo stores
             // pointers to them, and that structure is still chained into deviceInfo further down.
2421         VkPipelineCacheCreateInfo                       pcCI;
2422         std::vector<VkPipelinePoolSize>         poolSizes;
2423         if (cmdLine.isSubProcess())
2424         {
2425                 if (m_context.getResourceInterface()->getCacheDataSize() > 0)
2426                 {
2427                         pcCI =
2428                         {
2429                                 VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,                   // VkStructureType                              sType;
2430                                 DE_NULL,                                                                                                // const void*                                  pNext;
2431                                 VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
2432                                         VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT,   // VkPipelineCacheCreateFlags   flags;
2433                                 m_context.getResourceInterface()->getCacheDataSize(),   // deUintptr                                    initialDataSize;
2434                                 m_context.getResourceInterface()->getCacheData()                // const void*                                  pInitialData;
2435                         };
2436                         memReservationInfo.pipelineCacheCreateInfoCount         = 1;
2437                         memReservationInfo.pPipelineCacheCreateInfos            = &pcCI;
2438                 }
2439
2440                 poolSizes                                                       = m_context.getResourceInterface()->getPipelinePoolSizes();
2441                 if (!poolSizes.empty())
2442                 {
2443                         memReservationInfo.pipelinePoolSizeCount                = deUint32(poolSizes.size());
2444                         memReservationInfo.pPipelinePoolSizes                   = poolSizes.data();
2445                 }
2446         }
2447
2448 #endif // CTS_USES_VULKANSC
2449
             // pEnabledFeatures points at the features queried above, so every feature the
             // selected physical device reported is requested.
2450         const VkDeviceCreateInfo                                                deviceInfo                              =
2451         {
2452                 VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,                                                   // VkStructureType                                      sType;
2453                 pNext,                                                                                                                  // const void*                                          pNext;
2454                 (VkDeviceCreateFlags)0,                                                                                 // VkDeviceCreateFlags                          flags;
2455                 1u      ,                                                                                                                       // uint32_t                                                     queueCreateInfoCount;
2456                 &queueInfo,                                                                                                             // const VkDeviceQueueCreateInfo*       pQueueCreateInfos;
2457                 0u,                                                                                                                             // uint32_t                                                     enabledLayerCount;
2458                 DE_NULL,                                                                                                                // const char* const*                           ppEnabledLayerNames;
2459                 deUint32(deviceExtensions.size()),                                                              // uint32_t                                                     enabledExtensionCount;
2460                 (deviceExtensions.empty() ? DE_NULL : &deviceExtensions[0]),    // const char* const*                           ppEnabledExtensionNames;
2461                 &deviceFeatures,                                                                                                // const VkPhysicalDeviceFeatures*      pEnabledFeatures;
2462         };
2463
2464         m_logicalDevice         = createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), m_deviceGroupInstance, instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx], &deviceInfo);
             // NOTE(review): the Vulkan SC branch passes m_context.getInstance() to the driver while
             // the other branch passes m_deviceGroupInstance - confirm the difference is intentional.
2465 #ifndef CTS_USES_VULKANSC
2466         m_deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_deviceGroupInstance, *m_logicalDevice));
2467 #else
2468         m_deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(new DeviceDriverSC(m_context.getPlatformInterface(), m_context.getInstance(), *m_logicalDevice, m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(), m_context.getDeviceVulkanSC10Properties()), vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *m_logicalDevice));
2469 #endif // CTS_USES_VULKANSC
2470 }
2471
2472 class DispatchBaseTest : public vkt::TestCase
2473 {
2474 public:
2475                                                 DispatchBaseTest        (tcu::TestContext&      testCtx,
2476                                                                                         const std::string&      name,
2477                                                                                         const std::string&      description,
2478                                                                                         const deUint32          numValues,
2479                                                                                         const tcu::IVec3&       localsize,
2480                                                                                         const tcu::IVec3&       worksize,
2481                                                                                         const tcu::IVec3&       splitsize);
2482
2483         void                            initPrograms            (SourceCollections& sourceCollections) const;
2484         TestInstance*           createInstance          (Context&                       context) const;
2485
2486 private:
2487         const deUint32                                  m_numValues;
2488         const tcu::IVec3                                m_localSize;
2489         const tcu::IVec3                                m_workSize;
2490         const tcu::IVec3                                m_splitSize;
2491 };
2492
2493 class DispatchBaseTestInstance : public ComputeTestInstance
2494 {
2495 public:
2496                                                                         DispatchBaseTestInstance        (Context&                       context,
2497                                                                                                                                 const deUint32          numValues,
2498                                                                                                                                 const tcu::IVec3&       localsize,
2499                                                                                                                                 const tcu::IVec3&       worksize,
2500                                                                                                                                 const tcu::IVec3&       splitsize);
2501
2502         bool                                                    isInputVectorValid                      (const tcu::IVec3& small, const tcu::IVec3& big);
2503         tcu::TestStatus                                 iterate                                         (void);
2504
2505 private:
2506         const deUint32                                  m_numValues;
2507         const tcu::IVec3                                m_localSize;
2508         const tcu::IVec3                                m_workSize;
2509         const tcu::IVec3                                m_splitWorkSize;
2510 };
2511
2512 DispatchBaseTest::DispatchBaseTest (tcu::TestContext&   testCtx,
2513                                                                         const std::string&      name,
2514                                                                         const std::string&      description,
2515                                                                         const deUint32          numValues,
2516                                                                         const tcu::IVec3&       localsize,
2517                                                                         const tcu::IVec3&       worksize,
2518                                                                         const tcu::IVec3&       splitsize)
2519         : TestCase              (testCtx, name, description)
2520         , m_numValues   (numValues)
2521         , m_localSize   (localsize)
2522         , m_workSize    (worksize)
2523         , m_splitSize   (splitsize)
2524 {
2525 }
2526
2527 void DispatchBaseTest::initPrograms (SourceCollections& sourceCollections) const
2528 {
2529         std::ostringstream src;
2530         src << "#version 310 es\n"
2531                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2532
2533                 << "layout(binding = 0) buffer InOut {\n"
2534                 << "    uint values[" << de::toString(m_numValues) << "];\n"
2535                 << "} sb_inout;\n"
2536
2537                 << "layout(binding = 1) readonly uniform uniformInput {\n"
2538                 << "    uvec3 gridSize;\n"
2539                 << "} ubo_in;\n"
2540
2541                 << "void main (void) {\n"
2542                 << "    uvec3 size = ubo_in.gridSize * gl_WorkGroupSize;\n"
2543                 << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2544                 << "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2545                 << "    uint offset = numValuesPerInv*index;\n"
2546                 << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2547                 << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
2548                 << "}\n";
2549
2550         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2551 }
2552
2553 TestInstance* DispatchBaseTest::createInstance (Context& context) const
2554 {
2555         return new DispatchBaseTestInstance(context, m_numValues, m_localSize, m_workSize, m_splitSize);
2556 }
2557
2558 DispatchBaseTestInstance::DispatchBaseTestInstance (Context& context,
2559                                                                                                         const deUint32          numValues,
2560                                                                                                         const tcu::IVec3&       localsize,
2561                                                                                                         const tcu::IVec3&       worksize,
2562                                                                                                         const tcu::IVec3&       splitsize)
2563
2564         : ComputeTestInstance   (context)
2565         , m_numValues                   (numValues)
2566         , m_localSize                   (localsize)
2567         , m_workSize                    (worksize)
2568         , m_splitWorkSize               (splitsize)
2569 {
2570         // For easy work distribution across physical devices:
2571         // WorkSize should be a multiple of SplitWorkSize only in the X component
2572         if ((!isInputVectorValid(m_splitWorkSize, m_workSize)) ||
2573                 (m_workSize.x() <= m_splitWorkSize.x()) ||
2574                 (m_workSize.y() != m_splitWorkSize.y()) ||
2575                 (m_workSize.z() != m_splitWorkSize.z()))
2576                 TCU_THROW(TestError, "Invalid Input.");
2577
2578         // For easy work distribution within the same physical device:
2579         // SplitWorkSize should be a multiple of localSize in Y or Z component
2580         if ((!isInputVectorValid(m_localSize, m_splitWorkSize)) ||
2581                 (m_localSize.x() != m_splitWorkSize.x()) ||
2582                 (m_localSize.y() >= m_splitWorkSize.y()) ||
2583                 (m_localSize.z() >= m_splitWorkSize.z()))
2584                 TCU_THROW(TestError, "Invalid Input.");
2585
2586         if ((multiplyComponents(m_workSize) / multiplyComponents(m_splitWorkSize)) < (deInt32) m_numPhysDevices)
2587                 TCU_THROW(TestError, "Not enough work to distribute across all physical devices.");
2588
2589         deUint32 totalWork = multiplyComponents(m_workSize) * multiplyComponents(m_localSize);
2590         if ((totalWork > numValues) || (numValues % totalWork != 0))
2591                 TCU_THROW(TestError, "Buffer too small/not aligned to cover all values.");
2592 }
2593
2594 bool DispatchBaseTestInstance::isInputVectorValid(const tcu::IVec3& small, const tcu::IVec3& big)
2595 {
2596         if (((big.x() < small.x()) || (big.y() < small.y()) || (big.z() < small.z())) ||
2597                 ((big.x() % small.x() != 0) || (big.y() % small.y() != 0) || (big.z() % small.z() != 0)))
2598                 return false;
2599         return true;
2600 }
2601
2602 tcu::TestStatus DispatchBaseTestInstance::iterate (void)
2603 {
2604         const DeviceInterface&  vk                                      = getDeviceInterface();
2605         const VkDevice                  device                          = getDevice();
2606         const VkQueue                   queue                           = getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
2607         SimpleAllocator                 allocator                       (vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
2608         deUint32                                totalWorkloadSize       = 0;
2609
2610         // Create an uniform and input/output buffer
2611         const deUint32 uniformBufSize = 3; // Pass the compute grid size
2612         const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
2613         const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2614
2615         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
2616         const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2617
2618         // Fill the buffers with data
2619         typedef std::vector<deUint32> data_vector_t;
2620         data_vector_t uniformInputData(uniformBufSize);
2621         data_vector_t inputData(m_numValues);
2622
2623         {
2624                 const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2625                 deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2626                 uniformInputData[0] = *bufferPtr++ = m_workSize.x();
2627                 uniformInputData[1] = *bufferPtr++ = m_workSize.y();
2628                 uniformInputData[2] = *bufferPtr++ = m_workSize.z();
2629                 flushAlloc(vk, device, bufferAllocation);
2630         }
2631
2632         {
2633                 de::Random rnd(0x82ce7f);
2634                 const Allocation& bufferAllocation = buffer.getAllocation();
2635                 deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2636                 for (deUint32 i = 0; i < m_numValues; ++i)
2637                         inputData[i] = *bufferPtr++ = rnd.getUint32();
2638
2639                 flushAlloc(vk, device, bufferAllocation);
2640         }
2641
2642         // Create descriptor set
2643         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2644                 DescriptorSetLayoutBuilder()
2645                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2646                 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2647                 .build(vk, device));
2648
2649         const Unique<VkDescriptorPool> descriptorPool(
2650                 DescriptorPoolBuilder()
2651                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2652                 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2653                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2654
2655         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2656
2657         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
2658         const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2659
2660         DescriptorSetUpdateBuilder()
2661                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2662                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2663                 .update(vk, device);
2664
2665         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2666         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2667         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, static_cast<VkPipelineCreateFlags>(VK_PIPELINE_CREATE_DISPATCH_BASE), *shaderModule, static_cast<VkPipelineShaderStageCreateFlags>(0u)));
2668
2669         const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2670         const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2671
2672         const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2673
2674         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
2675         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2676
2677         // Start recording commands
2678         beginCommandBuffer(vk, *cmdBuffer);
2679
2680         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2681         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2682
2683         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2684
2685         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2686
2687         // Split the workload across all physical devices based on m_splitWorkSize.x()
2688         for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2689         {
2690                 deUint32 baseGroupX = physDevIdx * m_splitWorkSize.x();
2691                 deUint32 baseGroupY = 0;
2692                 deUint32 baseGroupZ = 0;
2693
2694                 // Split the workload within the physical device based on m_localSize.y() and m_localSize.z()
2695                 for (deInt32 localIdxY = 0; localIdxY < (m_splitWorkSize.y() / m_localSize.y()); localIdxY++)
2696                 {
2697                         for (deInt32 localIdxZ = 0; localIdxZ < (m_splitWorkSize.z() / m_localSize.z()); localIdxZ++)
2698                         {
2699                                 deUint32 offsetX = baseGroupX;
2700                                 deUint32 offsetY = baseGroupY + localIdxY * m_localSize.y();
2701                                 deUint32 offsetZ = baseGroupZ + localIdxZ * m_localSize.z();
2702
2703                                 deUint32 localSizeX = (physDevIdx == (m_numPhysDevices - 1)) ? m_workSize.x() - baseGroupX : m_localSize.x();
2704                                 deUint32 localSizeY = m_localSize.y();
2705                                 deUint32 localSizeZ = m_localSize.z();
2706
2707                                 totalWorkloadSize += (localSizeX * localSizeY * localSizeZ);
2708                                 vk.cmdDispatchBase(*cmdBuffer, offsetX, offsetY, offsetZ, localSizeX, localSizeY, localSizeZ);
2709                         }
2710                 }
2711         }
2712
2713         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2714
2715         endCommandBuffer(vk, *cmdBuffer);
2716         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2717
2718         if (totalWorkloadSize != deUint32(multiplyComponents(m_workSize)))
2719                 TCU_THROW(TestError, "Not covering the entire workload.");
2720
2721         // Validate the results
2722         const Allocation& bufferAllocation = buffer.getAllocation();
2723         invalidateAlloc(vk, device, bufferAllocation);
2724         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2725
2726         for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
2727         {
2728                 const deUint32 res = bufferPtr[ndx];
2729                 const deUint32 ref = ~inputData[ndx];
2730
2731                 if (res != ref)
2732                 {
2733                         std::ostringstream msg;
2734                         msg << "Comparison failed for InOut.values[" << ndx << "]";
2735                         return tcu::TestStatus::fail(msg.str());
2736                 }
2737         }
2738         return tcu::TestStatus::pass("Compute succeeded");
2739 }
2740
2741 class DeviceIndexTest : public vkt::TestCase
2742 {
2743 public:
2744         DeviceIndexTest         (tcu::TestContext&      testCtx,
2745                                                                                         const std::string&      name,
2746                                                                                         const std::string&      description,
2747                                                                                         const deUint32          numValues,
2748                                                                                         const tcu::IVec3&       localsize,
2749                                                                                         const tcu::IVec3&       splitsize);
2750
2751         void                            initPrograms            (SourceCollections& sourceCollections) const;
2752         TestInstance*           createInstance          (Context&                       context) const;
2753
2754 private:
2755         const deUint32                                  m_numValues;
2756         const tcu::IVec3                                m_localSize;
2757         const tcu::IVec3                                m_workSize;
2758         const tcu::IVec3                                m_splitSize;
2759 };
2760
2761 class DeviceIndexTestInstance : public ComputeTestInstance
2762 {
2763 public:
2764                                                                         DeviceIndexTestInstance (Context&                       context,
2765                                                                                                                                 const deUint32          numValues,
2766                                                                                                                                 const tcu::IVec3&       localsize,
2767                                                                                                                                 const tcu::IVec3&       worksize);
2768         tcu::TestStatus                                 iterate                                         (void);
2769 private:
2770         const deUint32                                  m_numValues;
2771         const tcu::IVec3                                m_localSize;
2772         tcu::IVec3                                              m_workSize;
2773 };
2774
2775 DeviceIndexTest::DeviceIndexTest (tcu::TestContext&     testCtx,
2776                                                                         const std::string&      name,
2777                                                                         const std::string&      description,
2778                                                                         const deUint32          numValues,
2779                                                                         const tcu::IVec3&       localsize,
2780                                                                         const tcu::IVec3&       worksize)
2781         : TestCase              (testCtx, name, description)
2782         , m_numValues   (numValues)
2783         , m_localSize   (localsize)
2784         , m_workSize    (worksize)
2785 {
2786 }
2787
2788 void DeviceIndexTest::initPrograms (SourceCollections& sourceCollections) const
2789 {
2790         std::ostringstream src;
2791         src << "#version 310 es\n"
2792                 << "#extension GL_EXT_device_group : require\n"
2793                 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2794
2795                 << "layout(binding = 0) buffer InOut {\n"
2796                 << "    uint values[" << de::toString(m_numValues) << "];\n"
2797                 << "} sb_inout;\n"
2798
2799                 << "layout(binding = 1) readonly uniform uniformInput {\n"
2800                 << "    uint baseOffset[1+" << VK_MAX_DEVICE_GROUP_SIZE << "];\n"
2801                 << "} ubo_in;\n"
2802
2803                 << "void main (void) {\n"
2804                 << "    uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
2805                 << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2806                 << "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2807                 << "    uint offset = numValuesPerInv*index;\n"
2808                 << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2809                 << "        sb_inout.values[offset + ndx] = ubo_in.baseOffset[0] + ubo_in.baseOffset[gl_DeviceIndex + 1];\n"
2810                 << "}\n";
2811
2812         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2813 }
2814
2815 TestInstance* DeviceIndexTest::createInstance (Context& context) const
2816 {
2817         return new DeviceIndexTestInstance(context, m_numValues, m_localSize, m_workSize);
2818 }
2819
2820 DeviceIndexTestInstance::DeviceIndexTestInstance (Context& context,
2821                                                                                                         const deUint32          numValues,
2822                                                                                                         const tcu::IVec3&       localsize,
2823                                                                                                         const tcu::IVec3&       worksize)
2824
2825         : ComputeTestInstance   (context)
2826         , m_numValues                   (numValues)
2827         , m_localSize                   (localsize)
2828         , m_workSize                    (worksize)
2829 {}
2830
2831 tcu::TestStatus DeviceIndexTestInstance::iterate (void)
2832 {
2833         const DeviceInterface&                  vk                                      = getDeviceInterface();
2834         const VkDevice                                  device                          = getDevice();
2835         const VkQueue                                   queue                           = getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
2836         SimpleAllocator                                 allocator                       (vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
2837         const deUint32                                  allocDeviceMask         = (1 << m_numPhysDevices) - 1;
2838         de::Random                                              rnd                                     (0x82ce7f);
2839         Move<VkBuffer>                                  sboBuffer;
2840         vk::Move<vk::VkDeviceMemory>    sboBufferMemory;
2841
2842         // Create an uniform and output buffer
2843         const deUint32 uniformBufSize = 4 * (1 + VK_MAX_DEVICE_GROUP_SIZE);
2844         const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
2845         const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2846
2847         const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
2848         const Buffer checkBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
2849
2850         // create SBO buffer
2851         {
2852                 const VkBufferCreateInfo        sboBufferParams =
2853                 {
2854                         VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,                                                                   // sType
2855                         DE_NULL,                                                                                                                                // pNext
2856                         0u,                                                                                                                                             // flags
2857                         (VkDeviceSize)bufferSizeBytes,                                                                                  // size
2858                         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,  // usage
2859                         VK_SHARING_MODE_EXCLUSIVE,                                                                                              // sharingMode
2860                         1u,                                                                                                                                             // queueFamilyIndexCount
2861                         &m_queueFamilyIndex,                                                                                                            // pQueueFamilyIndices
2862                 };
2863                 sboBuffer = createBuffer(vk, device, &sboBufferParams);
2864
2865                 VkMemoryRequirements memReqs = getBufferMemoryRequirements(vk, device, sboBuffer.get());
2866                 deUint32 memoryTypeNdx = 0;
2867                 const VkPhysicalDeviceMemoryProperties deviceMemProps = getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice());
2868                 for ( memoryTypeNdx = 0; memoryTypeNdx < deviceMemProps.memoryTypeCount; memoryTypeNdx++)
2869                 {
2870                         if ((memReqs.memoryTypeBits & (1u << memoryTypeNdx)) != 0 &&
2871                                 (deviceMemProps.memoryTypes[memoryTypeNdx].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
2872                                 break;
2873                 }
2874                 if (memoryTypeNdx == deviceMemProps.memoryTypeCount)
2875                         TCU_THROW(NotSupportedError, "No compatible memory type found");
2876
2877                 const VkMemoryAllocateFlagsInfo allocDeviceMaskInfo =
2878                 {
2879                         VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,           // sType
2880                         DE_NULL,                                                                                        // pNext
2881                         VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT,                                     // flags
2882                         allocDeviceMask,                                                                        // deviceMask
2883                 };
2884
2885                 VkMemoryAllocateInfo            allocInfo =
2886                 {
2887                         VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,                 // sType
2888                         &allocDeviceMaskInfo,                                                   // pNext
2889                         memReqs.size,                                                                   // allocationSize
2890                         memoryTypeNdx,                                                                  // memoryTypeIndex
2891                 };
2892
2893                 sboBufferMemory = allocateMemory(vk, device, &allocInfo);
2894                 VK_CHECK(vk.bindBufferMemory(device, *sboBuffer, sboBufferMemory.get(), 0));
2895         }
2896
2897         // Fill the buffers with data
2898         typedef std::vector<deUint32> data_vector_t;
2899         data_vector_t uniformInputData(uniformBufSize, 0);
2900
2901         {
2902                 const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2903                 deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2904                 for (deUint32 i = 0; i < uniformBufSize; ++i)
2905                         uniformInputData[i] = *bufferPtr++ = rnd.getUint32() / 10; // divide to prevent overflow in addition
2906
2907                 flushAlloc(vk, device, bufferAllocation);
2908         }
2909
2910         // Create descriptor set
2911         const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2912                 DescriptorSetLayoutBuilder()
2913                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2914                 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2915                 .build(vk, device));
2916
2917         const Unique<VkDescriptorPool> descriptorPool(
2918                 DescriptorPoolBuilder()
2919                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2920                 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2921                 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2922
2923         const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2924
2925         const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*sboBuffer, 0ull, bufferSizeBytes);
2926         const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2927
2928         DescriptorSetUpdateBuilder()
2929                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2930                 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2931                 .update(vk, device);
2932
2933         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2934         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2935         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
2936
2937         const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2938         const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
2939
2940         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
2941         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2942
2943         // Verify multiple device masks
2944         for (deUint32 physDevMask = 1; physDevMask < (1u << m_numPhysDevices); physDevMask++)
2945         {
2946                 deUint32 constantValPerLoop = 0;
2947                 {
2948                         const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2949                         deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2950                         constantValPerLoop = *bufferPtr = rnd.getUint32() / 10;  // divide to prevent overflow in addition
2951                         flushAlloc(vk, device, bufferAllocation);
2952                 }
2953                 beginCommandBuffer(vk, *cmdBuffer);
2954
2955                 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2956                 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2957                 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2958
2959                 vk.cmdSetDeviceMask(*cmdBuffer, physDevMask);
2960                 vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
2961
2962                 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2963
2964                 endCommandBuffer(vk, *cmdBuffer);
2965                 submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, physDevMask);
2966
2967                 // Validate the results on all physical devices where compute shader was launched
2968                 const VkBufferMemoryBarrier srcBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
2969                 const VkBufferMemoryBarrier dstBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *checkBuffer, 0ull, bufferSizeBytes);
2970                 const VkBufferCopy      copyParams =
2971                 {
2972                         (VkDeviceSize)0u,                                               // srcOffset
2973                         (VkDeviceSize)0u,                                               // dstOffset
2974                         bufferSizeBytes                                                 // size
2975                 };
2976
2977                 for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2978                 {
2979                         if (!(1<<physDevIdx & physDevMask))
2980                                 continue;
2981
2982                         const deUint32 deviceMask = 1 << physDevIdx;
2983
2984                         beginCommandBuffer(vk, *cmdBuffer);
2985                         vk.cmdSetDeviceMask(*cmdBuffer, deviceMask);
2986                         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT , VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &srcBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2987                         vk.cmdCopyBuffer(*cmdBuffer, *sboBuffer, *checkBuffer, 1, &copyParams);
2988                         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &dstBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2989
2990                         endCommandBuffer(vk, *cmdBuffer);
2991                         submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, deviceMask);
2992
2993                         const Allocation& bufferAllocation = checkBuffer.getAllocation();
2994                         invalidateAlloc(vk, device, bufferAllocation);
2995                         const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2996
2997                         for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
2998                         {
2999                                 const deUint32 res = bufferPtr[ndx];
3000                                 const deUint32 ref = constantValPerLoop + uniformInputData[4 * (physDevIdx + 1)];
3001
3002                                 if (res != ref)
3003                                 {
3004                                         std::ostringstream msg;
3005                                         msg << "Comparison failed on physical device "<< getPhysicalDevice(physDevIdx) <<" ( deviceMask "<< deviceMask <<" ) for InOut.values[" << ndx << "]";
3006                                         return tcu::TestStatus::fail(msg.str());
3007                                 }
3008                         }
3009                 }
3010         }
3011
3012         return tcu::TestStatus::pass("Compute succeeded");
3013 }
3014
3015 class ConcurrentCompute : public vkt::TestCase
3016 {
3017 public:
3018                                                 ConcurrentCompute       (tcu::TestContext&      testCtx,
3019                                                                                          const std::string&     name,
3020                                                                                          const std::string&     description);
3021
3022
3023         void                            initPrograms            (SourceCollections& sourceCollections) const;
3024         TestInstance*           createInstance          (Context&                       context) const;
3025 };
3026
3027 class ConcurrentComputeInstance : public vkt::TestInstance
3028 {
3029 public:
3030                                                                         ConcurrentComputeInstance       (Context& context);
3031
3032         tcu::TestStatus                                 iterate                                         (void);
3033 };
3034
3035 ConcurrentCompute::ConcurrentCompute (tcu::TestContext& testCtx,
3036                                                                           const std::string&    name,
3037                                                                           const std::string&    description)
3038         : TestCase              (testCtx, name, description)
3039 {
3040 }
3041
3042 void ConcurrentCompute::initPrograms (SourceCollections& sourceCollections) const
3043 {
3044         std::ostringstream src;
3045         src << "#version 310 es\n"
3046                 << "layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
3047                 << "layout(binding = 0) buffer InOut {\n"
3048                 << "    uint values[1024];\n"
3049                 << "} sb_inout;\n"
3050                 << "void main (void) {\n"
3051                 << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3052                 << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3053                 << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
3054                 << "    uint offset          = numValuesPerInv*groupNdx;\n"
3055                 << "\n"
3056                 << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3057                 << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
3058                 << "}\n";
3059
3060         sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
3061 }
3062
3063 TestInstance* ConcurrentCompute::createInstance (Context& context) const
3064 {
3065         return new ConcurrentComputeInstance(context);
3066 }
3067
3068 ConcurrentComputeInstance::ConcurrentComputeInstance (Context& context)
3069         : TestInstance  (context)
3070 {
3071 }
3072
3073 tcu::TestStatus ConcurrentComputeInstance::iterate (void)
3074 {
3075         enum {
3076                 NO_MATCH_FOUND  = ~((deUint32)0),
3077                 ERROR_NONE              = 0,
3078                 ERROR_WAIT              = 1,
3079                 ERROR_ORDER             = 2
3080         };
3081
3082         struct Queues
3083         {
3084                 VkQueue         queue;
3085                 deUint32        queueFamilyIndex;
3086         };
3087
3088 //      const DeviceInterface&                                  vk                                                      = m_context.getDeviceInterface();
3089         const deUint32                                                  numValues                                       = 1024;
3090         const CustomInstance                                    instance                                        (createCustomInstanceFromContext(m_context));
3091         const InstanceDriver&                                   instanceDriver                          (instance.getDriver());
3092         const VkPhysicalDevice                                  physicalDevice                          = chooseDevice(instanceDriver, instance, m_context.getTestContext().getCommandLine());
3093         tcu::TestLog&                                                   log                                                     = m_context.getTestContext().getLog();
3094         vk::Move<vk::VkDevice>                                  logicalDevice;
3095         std::vector<VkQueueFamilyProperties>    queueFamilyProperties;
3096         VkDeviceCreateInfo                                              deviceInfo;
3097         VkPhysicalDeviceFeatures                                deviceFeatures;
3098         const float                                                             queuePriorities[2]                      = {1.0f, 0.0f};
3099         VkDeviceQueueCreateInfo                                 queueInfos[2];
3100         Queues                                                                  queues[2]                                       =
3101                                                                                                                                                 {
3102                                                                                                                                                         {DE_NULL, (deUint32)NO_MATCH_FOUND},
3103                                                                                                                                                         {DE_NULL, (deUint32)NO_MATCH_FOUND}
3104                                                                                                                                                 };
3105
3106         queueFamilyProperties = getPhysicalDeviceQueueFamilyProperties(instanceDriver, physicalDevice);
3107
3108         for (deUint32 queueNdx = 0; queueNdx < queueFamilyProperties.size(); ++queueNdx)
3109         {
3110                 if (queueFamilyProperties[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
3111                 {
3112                         if (NO_MATCH_FOUND == queues[0].queueFamilyIndex)
3113                                 queues[0].queueFamilyIndex = queueNdx;
3114
3115                         if (queues[0].queueFamilyIndex != queueNdx || queueFamilyProperties[queueNdx].queueCount > 1u)
3116                         {
3117                                 queues[1].queueFamilyIndex = queueNdx;
3118                                 break;
3119                         }
3120                 }
3121         }
3122
3123         if (queues[0].queueFamilyIndex == NO_MATCH_FOUND || queues[1].queueFamilyIndex == NO_MATCH_FOUND)
3124                 TCU_THROW(NotSupportedError, "Queues couldn't be created");
3125
3126         for (int queueNdx = 0; queueNdx < 2; ++queueNdx)
3127         {
3128                 VkDeviceQueueCreateInfo queueInfo;
3129                 deMemset(&queueInfo, 0, sizeof(queueInfo));
3130
3131                 queueInfo.sType                         = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
3132                 queueInfo.pNext                         = DE_NULL;
3133                 queueInfo.flags                         = (VkDeviceQueueCreateFlags)0u;
3134                 queueInfo.queueFamilyIndex      = queues[queueNdx].queueFamilyIndex;
3135                 queueInfo.queueCount            = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 2 : 1;
3136                 queueInfo.pQueuePriorities      = (queueInfo.queueCount == 2) ? queuePriorities : &queuePriorities[queueNdx];
3137
3138                 queueInfos[queueNdx]            = queueInfo;
3139
3140                 if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3141                         break;
3142         }
3143
3144         void* pNext                                                                                             = DE_NULL;
3145 #ifdef CTS_USES_VULKANSC
3146         VkDeviceObjectReservationCreateInfo memReservationInfo  = m_context.getTestContext().getCommandLine().isSubProcess() ? m_context.getResourceInterface()->getStatMax() : resetDeviceObjectReservationCreateInfo();
3147         memReservationInfo.pNext                                                                = pNext;
3148         pNext                                                                                                   = &memReservationInfo;
3149
3150         VkPhysicalDeviceVulkanSC10Features sc10Features                 = createDefaultSC10Features();
3151         sc10Features.pNext                                                                              = pNext;
3152         pNext                                                                                                   = &sc10Features;
3153
3154         VkPipelineCacheCreateInfo                       pcCI;
3155         std::vector<VkPipelinePoolSize>         poolSizes;
3156         if (m_context.getTestContext().getCommandLine().isSubProcess())
3157         {
3158                 if (m_context.getResourceInterface()->getCacheDataSize() > 0)
3159                 {
3160                         pcCI =
3161                         {
3162                                 VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,                   // VkStructureType                              sType;
3163                                 DE_NULL,                                                                                                // const void*                                  pNext;
3164                                 VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
3165                                         VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT,   // VkPipelineCacheCreateFlags   flags;
3166                                 m_context.getResourceInterface()->getCacheDataSize(),   // deUintptr                                    initialDataSize;
3167                                 m_context.getResourceInterface()->getCacheData()                // const void*                                  pInitialData;
3168                         };
3169                         memReservationInfo.pipelineCacheCreateInfoCount         = 1;
3170                         memReservationInfo.pPipelineCacheCreateInfos            = &pcCI;
3171                 }
3172
3173                 poolSizes                                                       = m_context.getResourceInterface()->getPipelinePoolSizes();
3174                 if (!poolSizes.empty())
3175                 {
3176                         memReservationInfo.pipelinePoolSizeCount                        = deUint32(poolSizes.size());
3177                         memReservationInfo.pPipelinePoolSizes                           = poolSizes.data();
3178                 }
3179         }
3180 #endif // CTS_USES_VULKANSC
3181
3182         deMemset(&deviceInfo, 0, sizeof(deviceInfo));
3183         instanceDriver.getPhysicalDeviceFeatures(physicalDevice, &deviceFeatures);
3184
3185         deviceInfo.sType                                        = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
3186         deviceInfo.pNext                                        = pNext;
3187         deviceInfo.enabledExtensionCount        = 0u;
3188         deviceInfo.ppEnabledExtensionNames      = DE_NULL;
3189         deviceInfo.enabledLayerCount            = 0u;
3190         deviceInfo.ppEnabledLayerNames          = DE_NULL;
3191         deviceInfo.pEnabledFeatures                     = &deviceFeatures;
3192         deviceInfo.queueCreateInfoCount         = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 1 : 2;
3193         deviceInfo.pQueueCreateInfos            = queueInfos;
3194
3195         logicalDevice = createCustomDevice      (m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), instance, instanceDriver, physicalDevice, &deviceInfo);
3196
3197 #ifndef CTS_USES_VULKANSC
3198         de::MovePtr<vk::DeviceDriver>   deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), instance, *logicalDevice));
3199 #else
3200         de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter>        deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(new DeviceDriverSC(m_context.getPlatformInterface(), instance, *logicalDevice, m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(), m_context.getDeviceVulkanSC10Properties()), vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *logicalDevice));
3201 #endif // CTS_USES_VULKANSC
3202         vk::DeviceInterface& vk = *deviceDriver;
3203
3204         for (deUint32 queueReqNdx = 0; queueReqNdx < 2; ++queueReqNdx)
3205         {
3206                 if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3207                         vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, queueReqNdx, &queues[queueReqNdx].queue);
3208                 else
3209                         vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, 0u, &queues[queueReqNdx].queue);
3210         }
3211
3212         // Create an input/output buffers
3213         const VkPhysicalDeviceMemoryProperties memoryProperties = vk::getPhysicalDeviceMemoryProperties(instanceDriver, physicalDevice);
3214
3215         de::MovePtr<SimpleAllocator> allocator                                  = de::MovePtr<SimpleAllocator>(new SimpleAllocator(vk, *logicalDevice, memoryProperties));
3216         const VkDeviceSize bufferSizeBytes                                              = sizeof(deUint32) * numValues;
3217         const Buffer buffer1(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3218         const Buffer buffer2(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3219
3220         // Fill the buffers with data
3221
3222         typedef std::vector<deUint32> data_vector_t;
3223         data_vector_t inputData(numValues);
3224
3225         {
3226                 de::Random rnd(0x82ce7f);
3227                 const Allocation& bufferAllocation1     = buffer1.getAllocation();
3228                 const Allocation& bufferAllocation2     = buffer2.getAllocation();
3229                 deUint32* bufferPtr1                            = static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3230                 deUint32* bufferPtr2                            = static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3231
3232                 for (deUint32 i = 0; i < numValues; ++i)
3233                 {
3234                         deUint32 val = rnd.getUint32();
3235                         inputData[i] = val;
3236                         *bufferPtr1++ = val;
3237                         *bufferPtr2++ = val;
3238                 }
3239
3240                 flushAlloc(vk, *logicalDevice, bufferAllocation1);
3241                 flushAlloc(vk, *logicalDevice, bufferAllocation2);
3242         }
3243
3244         // Create descriptor sets
3245
3246         const Unique<VkDescriptorSetLayout>     descriptorSetLayout1(
3247                 DescriptorSetLayoutBuilder()
3248                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3249                 .build(vk, *logicalDevice));
3250
3251         const Unique<VkDescriptorPool>          descriptorPool1(
3252                 DescriptorPoolBuilder()
3253                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3254                 .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3255
3256         const Unique<VkDescriptorSet>           descriptorSet1(makeDescriptorSet(vk, *logicalDevice, *descriptorPool1, *descriptorSetLayout1));
3257
3258         const VkDescriptorBufferInfo            bufferDescriptorInfo1   = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
3259                 DescriptorSetUpdateBuilder()
3260                 .writeSingle(*descriptorSet1, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo1)
3261                 .update(vk, *logicalDevice);
3262
3263         const Unique<VkDescriptorSetLayout>     descriptorSetLayout2(
3264                 DescriptorSetLayoutBuilder()
3265                 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3266                 .build(vk, *logicalDevice));
3267
3268         const Unique<VkDescriptorPool>          descriptorPool2(
3269                 DescriptorPoolBuilder()
3270                 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3271                 .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3272
3273         const Unique<VkDescriptorSet>           descriptorSet2(makeDescriptorSet(vk, *logicalDevice, *descriptorPool2, *descriptorSetLayout2));
3274
3275         const VkDescriptorBufferInfo            bufferDescriptorInfo2   = makeDescriptorBufferInfo(*buffer2, 0ull, bufferSizeBytes);
3276                 DescriptorSetUpdateBuilder()
3277                 .writeSingle(*descriptorSet2, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo2)
3278                 .update(vk, *logicalDevice);
3279
3280         // Perform the computation
3281
3282         const Unique<VkShaderModule>            shaderModule(createShaderModule(vk, *logicalDevice, m_context.getBinaryCollection().get("comp"), 0u));
3283
3284         const Unique<VkPipelineLayout>          pipelineLayout1(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout1));
3285         const Unique<VkPipeline>                        pipeline1(makeComputePipeline(vk, *logicalDevice, *pipelineLayout1, *shaderModule));
3286         const VkBufferMemoryBarrier                     hostWriteBarrier1               = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3287         const VkBufferMemoryBarrier                     shaderWriteBarrier1             = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3288         const Unique<VkCommandPool>                     cmdPool1(makeCommandPool(vk, *logicalDevice, queues[0].queueFamilyIndex));
3289         const Unique<VkCommandBuffer>           cmdBuffer1(allocateCommandBuffer(vk, *logicalDevice, *cmdPool1, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3290
3291         const Unique<VkPipelineLayout>          pipelineLayout2(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout2));
3292         const Unique<VkPipeline>                        pipeline2(makeComputePipeline(vk, *logicalDevice, *pipelineLayout2, *shaderModule));
3293         const VkBufferMemoryBarrier                     hostWriteBarrier2               = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3294         const VkBufferMemoryBarrier                     shaderWriteBarrier2             = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3295         const Unique<VkCommandPool>                     cmdPool2(makeCommandPool(vk, *logicalDevice, queues[1].queueFamilyIndex));
3296         const Unique<VkCommandBuffer>           cmdBuffer2(allocateCommandBuffer(vk, *logicalDevice, *cmdPool2, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3297
3298         // Command buffer 1
3299
3300         beginCommandBuffer(vk, *cmdBuffer1);
3301         vk.cmdBindPipeline(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
3302         vk.cmdBindDescriptorSets(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout1, 0u, 1u, &descriptorSet1.get(), 0u, DE_NULL);
3303         vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3304         vk.cmdDispatch(*cmdBuffer1, 1, 1, 1);
3305         vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3306         endCommandBuffer(vk, *cmdBuffer1);
3307
3308         // Command buffer 2
3309
3310         beginCommandBuffer(vk, *cmdBuffer2);
3311         vk.cmdBindPipeline(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline2);
3312         vk.cmdBindDescriptorSets(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout2, 0u, 1u, &descriptorSet2.get(), 0u, DE_NULL);
3313         vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3314         vk.cmdDispatch(*cmdBuffer2, 1, 1, 1);
3315         vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3316         endCommandBuffer(vk, *cmdBuffer2);
3317
3318         VkSubmitInfo    submitInfo1 =
3319         {
3320                 VK_STRUCTURE_TYPE_SUBMIT_INFO,                  // sType
3321                 DE_NULL,                                                                // pNext
3322                 0u,                                                                             // waitSemaphoreCount
3323                 DE_NULL,                                                                // pWaitSemaphores
3324                 (const VkPipelineStageFlags*)DE_NULL,   // pWaitDstStageMask
3325                 1u,                                                                             // commandBufferCount
3326                 &cmdBuffer1.get(),                                              // pCommandBuffers
3327                 0u,                                                                             // signalSemaphoreCount
3328                 DE_NULL                                                                 // pSignalSemaphores
3329         };
3330
3331         VkSubmitInfo    submitInfo2 =
3332         {
3333                 VK_STRUCTURE_TYPE_SUBMIT_INFO,                  // sType
3334                 DE_NULL,                                                                // pNext
3335                 0u,                                                                             // waitSemaphoreCount
3336                 DE_NULL,                                                                // pWaitSemaphores
3337                 (const VkPipelineStageFlags*)DE_NULL,   // pWaitDstStageMask
3338                 1u,                                                                             // commandBufferCount
3339                 &cmdBuffer2.get(),                                              // pCommandBuffers
3340                 0u,                                                                             // signalSemaphoreCount
3341                 DE_NULL                                                                 // pSignalSemaphores
3342         };
3343
3344         // Wait for completion
3345         const Unique<VkFence>   fence1(createFence(vk, *logicalDevice));
3346         const Unique<VkFence>   fence2(createFence(vk, *logicalDevice));
3347
3348         VK_CHECK(vk.queueSubmit(queues[0].queue, 1u, &submitInfo1, *fence1));
3349         VK_CHECK(vk.queueSubmit(queues[1].queue, 1u, &submitInfo2, *fence2));
3350
3351         int err = ERROR_NONE;
3352
3353         // First wait for the low-priority queue
3354         if (VK_SUCCESS != vk.waitForFences(*logicalDevice, 1u, &fence2.get(), DE_TRUE, ~0ull))
3355                 err = ERROR_WAIT;
3356
3357         // If the high-priority queue hasn't finished, we have a problem.
3358         if (VK_SUCCESS != vk.getFenceStatus(*logicalDevice, fence1.get()))
3359                 if (err == ERROR_NONE)
3360                         err = ERROR_ORDER;
3361
3362         // Wait for the high-priority fence so we don't get errors on teardown.
3363         vk.waitForFences(*logicalDevice, 1u, &fence1.get(), DE_TRUE, ~0ull);
3364
3365         // If we fail() before waiting for all of the fences, error will come from
3366         // teardown instead of the error we want.
3367
3368         if (err == ERROR_WAIT)
3369         {
3370                 return tcu::TestStatus::fail("Failed waiting for low-priority queue fence.");
3371         }
3372
3373         // Validate the results
3374
3375         const Allocation& bufferAllocation1     = buffer1.getAllocation();
3376         invalidateAlloc(vk, *logicalDevice, bufferAllocation1);
3377         const deUint32* bufferPtr1                      = static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3378
3379         const Allocation& bufferAllocation2     = buffer2.getAllocation();
3380         invalidateAlloc(vk, *logicalDevice, bufferAllocation2);
3381         const deUint32* bufferPtr2                      = static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3382
3383         for (deUint32 ndx = 0; ndx < numValues; ++ndx)
3384         {
3385                 const deUint32 res1     = bufferPtr1[ndx];
3386                 const deUint32 res2     = bufferPtr2[ndx];
3387                 const deUint32 inp      = inputData[ndx];
3388                 const deUint32 ref      = ~inp;
3389
3390                 if (res1 != ref || res1 != res2)
3391                 {
3392                         std::ostringstream msg;
3393                         msg << "Comparison failed for InOut.values[" << ndx << "] ref:" << ref <<" res1:" << res1 << " res2:" << res2 << " inp:" << inp;
3394                         return tcu::TestStatus::fail(msg.str());
3395                 }
3396         }
3397
3398         if (err == ERROR_ORDER)
3399                 log << tcu::TestLog::Message << "Note: Low-priority queue was faster than high-priority one. This is not an error, but priorities may be inverted." << tcu::TestLog::EndMessage;
3400
3401         return tcu::TestStatus::pass("Test passed");
3402 }
3403
3404 class EmptyWorkGroupCase : public vkt::TestCase
3405 {
3406 public:
3407                                         EmptyWorkGroupCase              (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize);
3408         virtual                 ~EmptyWorkGroupCase             (void) {}
3409
3410         TestInstance*   createInstance                  (Context& context) const override;
3411         void                    initPrograms                    (vk::SourceCollections& programCollection) const override;
3412
3413 protected:
3414         const tcu::UVec3 m_dispatchSize;
3415 };
3416
3417 class EmptyWorkGroupInstance : public vkt::TestInstance
3418 {
3419 public:
3420                                                 EmptyWorkGroupInstance  (Context& context, const tcu::UVec3& dispatchSize)
3421                                                         : vkt::TestInstance     (context)
3422                                                         , m_dispatchSize        (dispatchSize)
3423                                                         {}
3424         virtual                         ~EmptyWorkGroupInstance (void) {}
3425
3426         tcu::TestStatus         iterate                                 (void) override;
3427
3428 protected:
3429         const tcu::UVec3 m_dispatchSize;
3430 };
3431
3432 EmptyWorkGroupCase::EmptyWorkGroupCase (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize)
3433         : vkt::TestCase         (testCtx, name, description)
3434         , m_dispatchSize        (dispatchSize)
3435 {
3436         DE_ASSERT(m_dispatchSize.x() == 0u || m_dispatchSize.y() == 0u || m_dispatchSize.z() == 0u);
3437 }
3438
3439 TestInstance* EmptyWorkGroupCase::createInstance (Context& context) const
3440 {
3441         return new EmptyWorkGroupInstance(context, m_dispatchSize);
3442 }
3443
3444 void EmptyWorkGroupCase::initPrograms (vk::SourceCollections& programCollection) const
3445 {
3446         std::ostringstream comp;
3447         comp
3448                 << "#version 450\n"
3449                 << "layout (local_size_x=1, local_size_y=1, local_size_z=1) in;\n"
3450                 << "layout (set=0, binding=0) buffer VerificationBlock { uint value; } verif;\n"
3451                 << "void main () { atomicAdd(verif.value, 1u); }\n"
3452                 ;
3453         programCollection.glslSources.add("comp") << glu::ComputeSource(comp.str());
3454 }
3455
3456 tcu::TestStatus EmptyWorkGroupInstance::iterate (void)
3457 {
3458         const auto&             vkd                             = m_context.getDeviceInterface();
3459         const auto              device                  = m_context.getDevice();
3460         auto&                   alloc                   = m_context.getDefaultAllocator();
3461         const auto              queueIndex              = m_context.getUniversalQueueFamilyIndex();
3462         const auto              queue                   = m_context.getUniversalQueue();
3463
3464         const auto                      verifBufferSize         = static_cast<VkDeviceSize>(sizeof(uint32_t));
3465         const auto                      verifBufferInfo         = makeBufferCreateInfo(verifBufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
3466         BufferWithMemory        verifBuffer                     (vkd, device, alloc, verifBufferInfo, MemoryRequirement::HostVisible);
3467         auto&                           verifBufferAlloc        = verifBuffer.getAllocation();
3468         void*                           verifBufferPtr          = verifBufferAlloc.getHostPtr();
3469
3470         deMemset(verifBufferPtr, 0, static_cast<size_t>(verifBufferSize));
3471         flushAlloc(vkd, device, verifBufferAlloc);
3472
3473         DescriptorSetLayoutBuilder layoutBuilder;
3474         layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
3475         const auto descriptorSetLayout = layoutBuilder.build(vkd, device);
3476
3477         const auto pipelineLayout       = makePipelineLayout(vkd, device, descriptorSetLayout.get());
3478         const auto shaderModule         = createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);
3479         const auto pipeline                     = makeComputePipeline(vkd, device, pipelineLayout.get(), shaderModule.get());
3480
3481         DescriptorPoolBuilder poolBuilder;
3482         poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
3483         const auto descriptorPool       = poolBuilder.build(vkd, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
3484         const auto descriptorSet        = makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());
3485
3486         DescriptorSetUpdateBuilder updateBuilder;
3487         const auto verifBufferDescInfo = makeDescriptorBufferInfo(verifBuffer.get(), 0ull, verifBufferSize);
3488         updateBuilder.writeSingle(descriptorSet.get(), DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &verifBufferDescInfo);
3489         updateBuilder.update(vkd, device);
3490
3491         const auto cmdPool = makeCommandPool(vkd, device, queueIndex);
3492         const auto cmdBufferPtr = allocateCommandBuffer(vkd, device, cmdPool.get(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3493         const auto cmdBuffer = cmdBufferPtr.get();
3494
3495         beginCommandBuffer(vkd, cmdBuffer);
3496         vkd.cmdBindPipeline(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.get());
3497         vkd.cmdBindDescriptorSets(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
3498         vkd.cmdDispatch(cmdBuffer, m_dispatchSize.x(), m_dispatchSize.y(), m_dispatchSize.z());
3499
3500         const auto readWriteAccess      = (VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
3501         const auto computeToCompute = makeMemoryBarrier(readWriteAccess, readWriteAccess);
3502         vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0U, 1u, &computeToCompute, 0u, nullptr, 0u, nullptr);
3503
3504         vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);
3505
3506         const auto computeToHost = makeMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT);
3507         vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 1u, &computeToHost, 0u, nullptr, 0u, nullptr);
3508
3509         endCommandBuffer(vkd, cmdBuffer);
3510         submitCommandsAndWait(vkd, device, queue, cmdBuffer);
3511
3512         uint32_t value;
3513         invalidateAlloc(vkd, device, verifBufferAlloc);
3514         deMemcpy(&value, verifBufferPtr, sizeof(value));
3515
3516         if (value != 1u)
3517         {
3518                 std::ostringstream msg;
3519                 msg << "Unexpected value found in buffer: " << value << " while expecting 1";
3520                 TCU_FAIL(msg.str());
3521         }
3522
3523         return tcu::TestStatus::pass("Pass");
3524 }
3525
3526 class MaxWorkGroupSizeTest : public vkt::TestCase
3527 {
3528 public:
3529         enum class Axis { X = 0, Y = 1, Z = 2 };
3530
3531         struct Params
3532         {
3533                 // Which axis to maximize.
3534                 Axis axis;
3535         };
3536
3537                                                         MaxWorkGroupSizeTest    (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params);
3538         virtual                                 ~MaxWorkGroupSizeTest   (void) {}
3539
3540         virtual void                    initPrograms                    (vk::SourceCollections& programCollection) const;
3541         virtual TestInstance*   createInstance                  (Context& context) const;
3542         virtual void                    checkSupport                    (Context& context) const;
3543
3544         // Helper to transform the axis value to an index.
3545         static int                              getIndex                                (Axis axis);
3546
3547         // Helper returning the number of invocations according to the test parameters.
3548         static deUint32                 getInvocations                  (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties = nullptr);
3549
3550         // Helper returning the buffer size needed to this test.
3551         static deUint32                 getSSBOSize                             (deUint32 invocations);
3552
3553 private:
3554         Params m_params;
3555 };
3556
3557 class MaxWorkGroupSizeInstance : public vkt::TestInstance
3558 {
3559 public:
3560                                                                 MaxWorkGroupSizeInstance        (Context& context, const MaxWorkGroupSizeTest::Params& params);
3561         virtual                                         ~MaxWorkGroupSizeInstance       (void) {}
3562
3563         virtual tcu::TestStatus         iterate                 (void);
3564
3565 private:
3566         MaxWorkGroupSizeTest::Params m_params;
3567 };
3568
3569 int MaxWorkGroupSizeTest::getIndex (Axis axis)
3570 {
3571         const int ret = static_cast<int>(axis);
3572         DE_ASSERT(ret >= static_cast<int>(Axis::X) && ret <= static_cast<int>(Axis::Z));
3573         return ret;
3574 }
3575
3576 deUint32 MaxWorkGroupSizeTest::getInvocations (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties)
3577 {
3578         const auto axis = getIndex(params.axis);
3579
3580         if (devProperties)
3581                 return devProperties->limits.maxComputeWorkGroupSize[axis];
3582         return vk::getPhysicalDeviceProperties(vki, physicalDevice).limits.maxComputeWorkGroupSize[axis];
3583 }
3584
3585 deUint32 MaxWorkGroupSizeTest::getSSBOSize (deUint32 invocations)
3586 {
3587         return invocations * static_cast<deUint32>(sizeof(deUint32));
3588 }
3589
3590 MaxWorkGroupSizeTest::MaxWorkGroupSizeTest (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params)
3591         : vkt::TestCase (testCtx, name, description)
3592         , m_params              (params)
3593 {}
3594
3595 void MaxWorkGroupSizeTest::initPrograms (vk::SourceCollections& programCollection) const
3596 {
3597         std::ostringstream shader;
3598
3599         // The actual local sizes will be set using spec constants when running the test instance.
3600         shader
3601                 << "#version 450\n"
3602                 << "\n"
3603                 << "layout(constant_id=0) const int local_size_x_val = 1;\n"
3604                 << "layout(constant_id=1) const int local_size_y_val = 1;\n"
3605                 << "layout(constant_id=2) const int local_size_z_val = 1;\n"
3606                 << "\n"
3607                 << "layout(local_size_x_id=0, local_size_y_id=1, local_size_z_id=2) in;\n"
3608                 << "\n"
3609                 << "layout(set=0, binding=0) buffer StorageBuffer {\n"
3610                 << "    uint values[];\n"
3611                 << "} ssbo;\n"
3612                 << "\n"
3613                 << "void main() {\n"
3614                 << "    ssbo.values[gl_LocalInvocationIndex] = 1u;\n"
3615                 << "}\n"
3616                 ;
3617
3618         programCollection.glslSources.add("comp") << glu::ComputeSource(shader.str());
3619 }
3620
3621 TestInstance* MaxWorkGroupSizeTest::createInstance (Context& context) const
3622 {
3623         return new MaxWorkGroupSizeInstance(context, m_params);
3624 }
3625
3626 void MaxWorkGroupSizeTest::checkSupport (Context& context) const
3627 {
3628         const auto&     vki                             = context.getInstanceInterface();
3629         const auto      physicalDevice  = context.getPhysicalDevice();
3630
3631         const auto      properties              = vk::getPhysicalDeviceProperties(vki, physicalDevice);
3632         const auto      invocations             = getInvocations(m_params, vki, physicalDevice, &properties);
3633
3634         if (invocations > properties.limits.maxComputeWorkGroupInvocations)
3635                 TCU_FAIL("Reported workgroup size limit in the axis is greater than the global invocation limit");
3636
3637         if (properties.limits.maxStorageBufferRange / static_cast<deUint32>(sizeof(deUint32)) < invocations)
3638                 TCU_THROW(NotSupportedError, "Maximum supported storage buffer range too small");
3639 }
3640
3641 MaxWorkGroupSizeInstance::MaxWorkGroupSizeInstance (Context& context, const MaxWorkGroupSizeTest::Params& params)
3642         : vkt::TestInstance     (context)
3643         , m_params                      (params)
3644 {}
3645
3646 tcu::TestStatus MaxWorkGroupSizeInstance::iterate (void)
3647 {
3648         const auto&     vki                             = m_context.getInstanceInterface();
3649         const auto&     vkd                             = m_context.getDeviceInterface();
3650         const auto      physicalDevice  = m_context.getPhysicalDevice();
3651         const auto      device                  = m_context.getDevice();
3652         auto&           alloc                   = m_context.getDefaultAllocator();
3653         const auto      queueIndex              = m_context.getUniversalQueueFamilyIndex();
3654         const auto      queue                   = m_context.getUniversalQueue();
3655         auto&           log                             = m_context.getTestContext().getLog();
3656
3657         const auto      axis                    = MaxWorkGroupSizeTest::getIndex(m_params.axis);
3658         const auto      invocations             = MaxWorkGroupSizeTest::getInvocations(m_params, vki, physicalDevice);
3659         const auto      ssboSize                = static_cast<vk::VkDeviceSize>(MaxWorkGroupSizeTest::getSSBOSize(invocations));
3660
3661         log
3662                 << tcu::TestLog::Message
3663                 << "Running test with " << invocations << " invocations on axis " << axis << " using a storage buffer size of " << ssboSize << " bytes"
3664                 << tcu::TestLog::EndMessage
3665                 ;
3666
3667         // Main SSBO buffer.
3668         const auto                              ssboInfo        = vk::makeBufferCreateInfo(ssboSize, vk::VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
3669         vk::BufferWithMemory    ssbo            (vkd, device, alloc, ssboInfo, vk::MemoryRequirement::HostVisible);
3670
3671         // Shader module.
3672         const auto shaderModule = vk::createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);
3673
3674         // Descriptor set layouts.
3675         vk::DescriptorSetLayoutBuilder layoutBuilder;
3676         layoutBuilder.addSingleBinding(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, vk::VK_SHADER_STAGE_COMPUTE_BIT);
3677         const auto descriptorSetLayout = layoutBuilder.build(vkd, device);
3678
3679         // Specialization constants: set the number of invocations in the appropriate local size id.
3680         const auto      entrySize                               = static_cast<deUintptr>(sizeof(deInt32));
3681         deInt32         specializationData[3]   = { 1, 1, 1 };
3682         specializationData[axis] = static_cast<deInt32>(invocations);
3683
3684         const vk::VkSpecializationMapEntry specializationMaps[3] =
3685         {
3686                 {
3687                         0u,                                                                             //      deUint32        constantID;
3688                         0u,                                                                             //      deUint32        offset;
3689                         entrySize,                                                              //      deUintptr       size;
3690                 },
3691                 {
3692                         1u,                                                                             //      deUint32        constantID;
3693                         static_cast<deUint32>(entrySize),               //      deUint32        offset;
3694                         entrySize,                                                              //      deUintptr       size;
3695                 },
3696                 {
3697                         2u,                                                                             //      deUint32        constantID;
3698                         static_cast<deUint32>(entrySize * 2u),  //      deUint32        offset;
3699                         entrySize,                                                              //      deUintptr       size;
3700                 },
3701         };
3702
3703         const vk::VkSpecializationInfo specializationInfo =
3704         {
3705                 3u,                                                                                                     //      deUint32                                                mapEntryCount;
3706                 specializationMaps,                                                                     //      const VkSpecializationMapEntry* pMapEntries;
3707                 static_cast<deUintptr>(sizeof(specializationData)),     //      deUintptr                                               dataSize;
3708                 specializationData,                                                                     //      const void*                                             pData;
3709         };
3710
3711         // Test pipeline.
3712         const vk::VkPipelineLayoutCreateInfo testPipelineLayoutInfo =
3713         {
3714                 vk::VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,      //      VkStructureType                                 sType;
3715                 nullptr,                                                                                        //      const void*                                             pNext;
3716                 0u,                                                                                                     //      VkPipelineLayoutCreateFlags             flags;
3717                 1u,                                                                                                     //      deUint32                                                setLayoutCount;
3718                 &descriptorSetLayout.get(),                                                     //      const VkDescriptorSetLayout*    pSetLayouts;
3719                 0u,                                                                                                     //      deUint32                                                pushConstantRangeCount;
3720                 nullptr,                                                                                        //      const VkPushConstantRange*              pPushConstantRanges;
3721         };
3722         const auto testPipelineLayout = vk::createPipelineLayout(vkd, device, &testPipelineLayoutInfo);
3723
3724         const vk::VkComputePipelineCreateInfo testPipelineInfo =
3725         {
3726                 vk::VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,     //      VkStructureType                                 sType;
3727                 nullptr,                                                                                        //      const void*                                             pNext;
3728                 0u,                                                                                                     //      VkPipelineCreateFlags                   flags;
3729                 {                                                                                                       //      VkPipelineShaderStageCreateInfo stage;
3730                         vk::VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,//      VkStructureType                                         sType;
3731                         nullptr,                                                                                                //      const void*                                                     pNext;
3732                         0u,                                                                                                             //      VkPipelineShaderStageCreateFlags        flags;
3733                         vk::VK_SHADER_STAGE_COMPUTE_BIT,                                                //      VkShaderStageFlagBits                           stage;
3734                         shaderModule.get(),                                                                             //      VkShaderModule                                          module;
3735                         "main",                                                                                                 //      const char*                                                     pName;
3736                         &specializationInfo,                                                                    //      const VkSpecializationInfo*                     pSpecializationInfo;
3737                 },
3738                 testPipelineLayout.get(),                                                       //      VkPipelineLayout                                layout;
3739                 DE_NULL,                                                                                        //      VkPipeline                                              basePipelineHandle;
3740                 0u,                                                                                                     //      deInt32                                                 basePipelineIndex;
3741         };
3742         const auto testPipeline = vk::createComputePipeline(vkd, device, DE_NULL, &testPipelineInfo);
3743
3744         // Create descriptor pool and set.
3745         vk::DescriptorPoolBuilder poolBuilder;
3746         poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
3747         const auto descriptorPool       = poolBuilder.build(vkd, device, vk::VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
3748         const auto descriptorSet        = vk::makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());
3749
3750         // Update descriptor set.
3751         const vk::VkDescriptorBufferInfo ssboBufferInfo =
3752         {
3753                 ssbo.get(),             //      VkBuffer                buffer;
3754                 0u,                             //      VkDeviceSize    offset;
3755                 VK_WHOLE_SIZE,  //      VkDeviceSize    range;
3756         };
3757
3758         vk::DescriptorSetUpdateBuilder updateBuilder;
3759         updateBuilder.writeSingle(descriptorSet.get(), vk::DescriptorSetUpdateBuilder::Location::binding(0u), vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &ssboBufferInfo);
3760         updateBuilder.update(vkd, device);
3761
3762         // Clear buffer.
3763         auto& ssboAlloc = ssbo.getAllocation();
3764         void* ssboPtr   = ssboAlloc.getHostPtr();
3765         deMemset(ssboPtr, 0, static_cast<size_t>(ssboSize));
3766         vk::flushAlloc(vkd, device, ssboAlloc);
3767
3768         // Run pipelines.
3769         const auto cmdPool              = vk::makeCommandPool(vkd, device, queueIndex);
3770         const auto cmdBUfferPtr = vk::allocateCommandBuffer(vkd, device, cmdPool.get(), vk::VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3771         const auto cmdBuffer    = cmdBUfferPtr.get();
3772
3773         vk::beginCommandBuffer(vkd, cmdBuffer);
3774
3775         // Run the main test shader.
3776         const auto hostToComputeBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_HOST_WRITE_BIT, vk::VK_ACCESS_SHADER_WRITE_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
3777         vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_HOST_BIT, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0u, 0u, nullptr, 1u, &hostToComputeBarrier, 0u, nullptr);
3778
3779         vkd.cmdBindPipeline(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipeline.get());
3780         vkd.cmdBindDescriptorSets(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
3781         vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);
3782
3783         const auto computeToHostBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_SHADER_WRITE_BIT, vk::VK_ACCESS_HOST_READ_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
3784         vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, vk::VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u, nullptr, 1u, &computeToHostBarrier, 0u, nullptr);
3785
3786         vk::endCommandBuffer(vkd, cmdBuffer);
3787         vk::submitCommandsAndWait(vkd, device, queue, cmdBuffer);
3788
3789         // Verify buffer contents.
3790         vk::invalidateAlloc(vkd, device, ssboAlloc);
3791         std::unique_ptr<deUint32[]>     valuesArray     (new deUint32[invocations]);
3792         deUint32*                                       valuesPtr       = valuesArray.get();
3793         deMemcpy(valuesPtr, ssboPtr, static_cast<size_t>(ssboSize));
3794
3795         std::string     errorMsg;
3796         bool            ok                      = true;
3797
3798         for (size_t i = 0; i < invocations; ++i)
3799         {
3800                 if (valuesPtr[i] != 1u)
3801                 {
3802                         ok                      = false;
3803                         errorMsg        = "Found invalid value for invocation index " + de::toString(i) + ": expected 1u and found " + de::toString(valuesPtr[i]);
3804                         break;
3805                 }
3806         }
3807
3808         if (!ok)
3809                 return tcu::TestStatus::fail(errorMsg);
3810         return tcu::TestStatus::pass("Pass");
3811 }
3812
3813 namespace EmptyShaderTest
3814 {
3815
3816 void createProgram (SourceCollections& dst)
3817 {
3818         dst.glslSources.add("comp") << glu::ComputeSource(
3819                 "#version 310 es\n"
3820                 "layout (local_size_x = 1) in;\n"
3821                 "void main (void) {}\n"
3822         );
3823 }
3824
3825 tcu::TestStatus createTest (Context& context)
3826 {
3827         const DeviceInterface&  vk                                      = context.getDeviceInterface();
3828         const VkDevice                  device                          = context.getDevice();
3829         const VkQueue                   queue                           = context.getUniversalQueue();
3830         const deUint32                  queueFamilyIndex        = context.getUniversalQueueFamilyIndex();
3831
3832         const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, context.getBinaryCollection().get("comp"), 0u));
3833
3834         const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device));
3835         const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
3836
3837         const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
3838         const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3839
3840         // Start recording commands
3841
3842         beginCommandBuffer(vk, *cmdBuffer);
3843
3844         vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
3845
3846         const tcu::IVec3 workGroups(1, 1, 1);
3847         vk.cmdDispatch(*cmdBuffer, workGroups.x(), workGroups.y(), workGroups.z());
3848
3849         endCommandBuffer(vk, *cmdBuffer);
3850
3851         submitCommandsAndWait(vk, device, queue, *cmdBuffer);
3852
3853         return tcu::TestStatus::pass("Compute succeeded");
3854 }
3855
3856 } // EmptyShaderTest ns
3857 } // anonymous
3858
3859 tcu::TestCaseGroup* createBasicComputeShaderTests (tcu::TestContext& testCtx)
3860 {
3861         de::MovePtr<tcu::TestCaseGroup> basicComputeTests(new tcu::TestCaseGroup(testCtx, "basic", "Basic compute tests"));
3862
3863         addFunctionCaseWithPrograms(basicComputeTests.get(), "empty_shader", "Shader that does nothing", EmptyShaderTest::createProgram, EmptyShaderTest::createTest);
3864
3865         basicComputeTests->addChild(new ConcurrentCompute(testCtx, "concurrent_compute", "Concurrent compute test"));
3866
3867         basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_x", "Use an empty workgroup with size 0 on the X axis", tcu::UVec3(0u, 2u, 3u)));
3868         basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_y", "Use an empty workgroup with size 0 on the Y axis", tcu::UVec3(2u, 0u, 3u)));
3869         basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_z", "Use an empty workgroup with size 0 on the Z axis", tcu::UVec3(2u, 3u, 0u)));
3870         basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_all", "Use an empty workgroup with size 0 on the X, Y and Z axes", tcu::UVec3(0u, 0u, 0u)));
3871
3872         basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_x", "Use the maximum work group size on the X axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::X}));
3873         basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_y", "Use the maximum work group size on the Y axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Y}));
3874         basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_z", "Use the maximum work group size on the Z axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Z}));
3875
3876         basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,      "ubo_to_ssbo_single_invocation",        "Copy from UBO to SSBO, inverting bits",        256,    tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3877         basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,      "ubo_to_ssbo_single_group",                     "Copy from UBO to SSBO, inverting bits",        1024,   tcu::IVec3(2,1,4),      tcu::IVec3(1,1,1)));
3878         basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,      "ubo_to_ssbo_multiple_invocations",     "Copy from UBO to SSBO, inverting bits",        1024,   tcu::IVec3(1,1,1),      tcu::IVec3(2,4,1)));
3879         basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx,      "ubo_to_ssbo_multiple_groups",          "Copy from UBO to SSBO, inverting bits",        1024,   tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3880
3881         basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,       "copy_ssbo_single_invocation",          "Copy between SSBOs, inverting bits",   256,    tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3882         basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,       "copy_ssbo_multiple_invocations",       "Copy between SSBOs, inverting bits",   1024,   tcu::IVec3(1,1,1),      tcu::IVec3(2,4,1)));
3883         basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx,       "copy_ssbo_multiple_groups",            "Copy between SSBOs, inverting bits",   1024,   tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3884
3885         basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,  "ssbo_rw_single_invocation",                    "Read and write same SSBO",             256,    true,   tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3886         basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,  "ssbo_rw_multiple_groups",                              "Read and write same SSBO",             1024,   true,   tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3887         basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,  "ssbo_unsized_arr_single_invocation",   "Read and write same SSBO",             256,    false,  tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3888         basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx,  "ssbo_unsized_arr_multiple_groups",             "Read and write same SSBO",             1024,   false,  tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3889
3890         basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,        "write_multiple_arr_single_invocation",                 "Write to multiple SSBOs",      256,    true,   tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3891         basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,        "write_multiple_arr_multiple_groups",                   "Write to multiple SSBOs",      1024,   true,   tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3892         basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,        "write_multiple_unsized_arr_single_invocation", "Write to multiple SSBOs",      256,    false,  tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3893         basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx,        "write_multiple_unsized_arr_multiple_groups",   "Write to multiple SSBOs",      1024,   false,  tcu::IVec3(1,4,2),      tcu::IVec3(2,2,4)));
3894
3895         basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,   "ssbo_local_barrier_single_invocation", "SSBO local barrier usage",     tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3896         basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,   "ssbo_local_barrier_single_group",              "SSBO local barrier usage",     tcu::IVec3(3,2,5),      tcu::IVec3(1,1,1)));
3897         basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx,   "ssbo_local_barrier_multiple_groups",   "SSBO local barrier usage",     tcu::IVec3(3,4,1),      tcu::IVec3(2,7,3)));
3898
3899         basicComputeTests->addChild(new SSBOBarrierTest(testCtx,        "ssbo_cmd_barrier_single",              "SSBO memory barrier usage",    tcu::IVec3(1,1,1)));
3900         basicComputeTests->addChild(new SSBOBarrierTest(testCtx,        "ssbo_cmd_barrier_multiple",    "SSBO memory barrier usage",    tcu::IVec3(11,5,7)));
3901
3902         basicComputeTests->addChild(new SharedVarTest(testCtx,  "shared_var_single_invocation",         "Basic shared variable usage",  tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3903         basicComputeTests->addChild(new SharedVarTest(testCtx,  "shared_var_single_group",                      "Basic shared variable usage",  tcu::IVec3(3,2,5),      tcu::IVec3(1,1,1)));
3904         basicComputeTests->addChild(new SharedVarTest(testCtx,  "shared_var_multiple_invocations",      "Basic shared variable usage",  tcu::IVec3(1,1,1),      tcu::IVec3(2,5,4)));
3905         basicComputeTests->addChild(new SharedVarTest(testCtx,  "shared_var_multiple_groups",           "Basic shared variable usage",  tcu::IVec3(3,4,1),      tcu::IVec3(2,7,3)));
3906
3907         basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,  "shared_atomic_op_single_invocation",           "Atomic operation with shared var",             tcu::IVec3(1,1,1),      tcu::IVec3(1,1,1)));
3908         basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,  "shared_atomic_op_single_group",                        "Atomic operation with shared var",             tcu::IVec3(3,2,5),      tcu::IVec3(1,1,1)));
3909         basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,  "shared_atomic_op_multiple_invocations",        "Atomic operation with shared var",             tcu::IVec3(1,1,1),      tcu::IVec3(2,5,4)));
3910         basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx,  "shared_atomic_op_multiple_groups",                     "Atomic operation with shared var",             tcu::IVec3(3,4,1),      tcu::IVec3(2,7,3)));
3911
3912         basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,    "copy_image_to_ssbo_small",     "Image to SSBO copy",   tcu::IVec2(1,1),        tcu::IVec2(64,64)));
3913         basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx,    "copy_image_to_ssbo_large",     "Image to SSBO copy",   tcu::IVec2(2,4),        tcu::IVec2(512,512)));
3914
3915         basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,    "copy_ssbo_to_image_small",     "SSBO to image copy",   tcu::IVec2(1, 1),       tcu::IVec2(64, 64)));
3916         basicComputeTests->addChild(new CopySSBOToImageTest(testCtx,    "copy_ssbo_to_image_large",     "SSBO to image copy",   tcu::IVec2(2, 4),       tcu::IVec2(512, 512)));
3917
3918         basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,      "image_atomic_op_local_size_1", "Atomic operation with image",  1,      tcu::IVec2(64,64)));
3919         basicComputeTests->addChild(new ImageAtomicOpTest(testCtx,      "image_atomic_op_local_size_8", "Atomic operation with image",  8,      tcu::IVec2(64,64)));
3920
3921         basicComputeTests->addChild(new ImageBarrierTest(testCtx,       "image_barrier_single",         "Image barrier",        tcu::IVec2(1,1)));
3922         basicComputeTests->addChild(new ImageBarrierTest(testCtx,       "image_barrier_multiple",       "Image barrier",        tcu::IVec2(64,64)));
3923
3924 #ifndef CTS_USES_VULKANSC
3925         basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx, "write_ssbo_array", "", "compute", "write_ssbo_array.amber"));
3926 #endif
3927
3928         return basicComputeTests.release();
3929 }
3930
3931 tcu::TestCaseGroup* createBasicDeviceGroupComputeShaderTests (tcu::TestContext& testCtx)
3932 {
3933         de::MovePtr<tcu::TestCaseGroup> deviceGroupComputeTests(new tcu::TestCaseGroup(testCtx, "device_group", "Basic device group compute tests"));
3934
3935         deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx, "dispatch_base",        "Compute shader with base groups",                              32768,  tcu::IVec3(4,2,4),      tcu::IVec3(16,8,8),     tcu::IVec3(4,8,8)));
3936         deviceGroupComputeTests->addChild(new DeviceIndexTest(testCtx,  "device_index",         "Compute shader using deviceIndex in SPIRV",    96,             tcu::IVec3(3,2,1),      tcu::IVec3(2,4,1)));
3937
3938         return deviceGroupComputeTests.release();
3939
3940 }
3941 } // compute
3942 } // vkt