1 /*------------------------------------------------------------------------
2 * Vulkan Conformance Tests
3 * ------------------------
5 * Copyright (c) 2019 The Khronos Group Inc.
6 * Copyright (c) 2019 The Android Open Source Project
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
22 * \brief Compute Shader Tests
23 *//*--------------------------------------------------------------------*/
25 #include "vktComputeBasicComputeShaderTests.hpp"
26 #include "vktTestCase.hpp"
27 #include "vktTestCaseUtil.hpp"
28 #include "vktComputeTestsUtil.hpp"
29 #include "vktCustomInstancesDevices.hpp"
30 #include "vktAmberTestCase.hpp"
34 #include "vkRefUtil.hpp"
35 #include "vkPlatform.hpp"
36 #include "vkPrograms.hpp"
37 #include "vkRefUtil.hpp"
38 #include "vkMemUtil.hpp"
39 #include "vkBarrierUtil.hpp"
40 #include "vkQueryUtil.hpp"
41 #include "vkBuilderUtil.hpp"
42 #include "vkTypeUtil.hpp"
43 #include "vkDeviceUtil.hpp"
44 #include "vkCmdUtil.hpp"
45 #include "vkObjUtil.hpp"
46 #include "vkBufferWithMemory.hpp"
47 #include "vkSafetyCriticalUtil.hpp"
49 #include "tcuCommandLine.hpp"
50 #include "tcuTestLog.hpp"
52 #include "deStringUtil.hpp"
53 #include "deUniquePtr.hpp"
54 #include "deRandom.hpp"
68 template<typename T, int size>
69 T multiplyComponents (const tcu::Vector<T, size>& v)
72 for (int i = 0; i < size; ++i)
// Square of a value; used when computing host-side reference results.
template<typename T>
inline T squared (const T& a)
{
    return a * a;
}
83 inline VkImageCreateInfo make2DImageCreateInfo (const tcu::IVec2& imageSize, const VkImageUsageFlags usage)
85 const VkImageCreateInfo imageParams =
87 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
88 DE_NULL, // const void* pNext;
89 0u, // VkImageCreateFlags flags;
90 VK_IMAGE_TYPE_2D, // VkImageType imageType;
91 VK_FORMAT_R32_UINT, // VkFormat format;
92 vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), // VkExtent3D extent;
93 1u, // deUint32 mipLevels;
94 1u, // deUint32 arrayLayers;
95 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
96 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
97 usage, // VkImageUsageFlags usage;
98 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
99 0u, // deUint32 queueFamilyIndexCount;
100 DE_NULL, // const deUint32* pQueueFamilyIndices;
101 VK_IMAGE_LAYOUT_UNDEFINED, // VkImageLayout initialLayout;
106 inline VkBufferImageCopy makeBufferImageCopy(const tcu::IVec2& imageSize)
108 return compute::makeBufferImageCopy(vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), 1u);
117 class SharedVarTest : public vkt::TestCase
120 SharedVarTest (tcu::TestContext& testCtx,
121 const std::string& name,
122 const std::string& description,
123 const tcu::IVec3& localSize,
124 const tcu::IVec3& workSize);
126 void initPrograms (SourceCollections& sourceCollections) const;
127 TestInstance* createInstance (Context& context) const;
130 const tcu::IVec3 m_localSize;
131 const tcu::IVec3 m_workSize;
134 class SharedVarTestInstance : public vkt::TestInstance
137 SharedVarTestInstance (Context& context,
138 const tcu::IVec3& localSize,
139 const tcu::IVec3& workSize);
141 tcu::TestStatus iterate (void);
144 const tcu::IVec3 m_localSize;
145 const tcu::IVec3 m_workSize;
148 SharedVarTest::SharedVarTest (tcu::TestContext& testCtx,
149 const std::string& name,
150 const std::string& description,
151 const tcu::IVec3& localSize,
152 const tcu::IVec3& workSize)
153 : TestCase (testCtx, name, description)
154 , m_localSize (localSize)
155 , m_workSize (workSize)
159 void SharedVarTest::initPrograms (SourceCollections& sourceCollections) const
// Generates a compute shader with one output slot per invocation. Each
// invocation writes into a *mirrored* slot of a shared array, then reads
// back its own slot, so a correct result requires working shared memory.
161 const int workGroupSize = multiplyComponents(m_localSize);
162 const int workGroupCount = multiplyComponents(m_workSize);
163 const int numValues = workGroupSize * workGroupCount;
165 std::ostringstream src;
166 src << "#version 310 es\n"
167 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
168 << "layout(binding = 0) writeonly buffer Output {\n"
169 << " uint values[" << numValues << "];\n"
// NOTE(review): the buffer-block terminator ("} sb_out;") and a barrier()
// between the shared write and read appear to be missing from this extract
// -- confirm against the upstream file before building.
171 << "shared uint offsets[" << workGroupSize << "];\n\n"
172 << "void main (void) {\n"
173 << " uint localSize = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
// Flattened work-group index used to compute this group's output slice.
174 << " uint globalNdx = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
175 << " uint globalOffs = localSize*globalNdx;\n"
176 << " uint localOffs = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
// Writes the mirrored slot; the subsequent read of offsets[localOffs]
// therefore observes a value produced by a different invocation.
178 << " offsets[localSize-localOffs-1u] = globalOffs + localOffs*localOffs;\n"
179 << " memoryBarrierShared();\n"
181 << " sb_out.values[globalOffs + localOffs] = offsets[localOffs];\n"
184 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
187 TestInstance* SharedVarTest::createInstance (Context& context) const
189 return new SharedVarTestInstance(context, m_localSize, m_workSize);
192 SharedVarTestInstance::SharedVarTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
193 : TestInstance (context)
194 , m_localSize (localSize)
195 , m_workSize (workSize)
199 tcu::TestStatus SharedVarTestInstance::iterate (void)
201 const DeviceInterface& vk = m_context.getDeviceInterface();
202 const VkDevice device = m_context.getDevice();
203 const VkQueue queue = m_context.getUniversalQueue();
204 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
205 Allocator& allocator = m_context.getDefaultAllocator();
207 const int workGroupSize = multiplyComponents(m_localSize);
208 const int workGroupCount = multiplyComponents(m_workSize);
210 // Create a buffer and host-visible memory for it
212 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
213 const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
215 // Create descriptor set
217 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
218 DescriptorSetLayoutBuilder()
219 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
222 const Unique<VkDescriptorPool> descriptorPool(
223 DescriptorPoolBuilder()
224 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
225 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
227 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
229 const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
230 DescriptorSetUpdateBuilder()
231 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
234 // Perform the computation
236 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
237 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
238 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
240 const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
242 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
243 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
245 // Start recording commands
247 beginCommandBuffer(vk, *cmdBuffer);
249 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
250 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
252 vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
254 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
256 endCommandBuffer(vk, *cmdBuffer);
258 // Wait for completion
260 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
262 // Validate the results
264 const Allocation& bufferAllocation = buffer.getAllocation();
265 invalidateAlloc(vk, device, bufferAllocation);
267 const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
269 for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
271 const int globalOffset = groupNdx * workGroupSize;
272 for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
274 const deUint32 res = bufferPtr[globalOffset + localOffset];
275 const deUint32 ref = globalOffset + squared(workGroupSize - localOffset - 1);
279 std::ostringstream msg;
280 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
281 return tcu::TestStatus::fail(msg.str());
285 return tcu::TestStatus::pass("Compute succeeded");
288 class SharedVarAtomicOpTest : public vkt::TestCase
291 SharedVarAtomicOpTest (tcu::TestContext& testCtx,
292 const std::string& name,
293 const std::string& description,
294 const tcu::IVec3& localSize,
295 const tcu::IVec3& workSize);
297 void initPrograms (SourceCollections& sourceCollections) const;
298 TestInstance* createInstance (Context& context) const;
301 const tcu::IVec3 m_localSize;
302 const tcu::IVec3 m_workSize;
305 class SharedVarAtomicOpTestInstance : public vkt::TestInstance
308 SharedVarAtomicOpTestInstance (Context& context,
309 const tcu::IVec3& localSize,
310 const tcu::IVec3& workSize);
312 tcu::TestStatus iterate (void);
315 const tcu::IVec3 m_localSize;
316 const tcu::IVec3 m_workSize;
319 SharedVarAtomicOpTest::SharedVarAtomicOpTest (tcu::TestContext& testCtx,
320 const std::string& name,
321 const std::string& description,
322 const tcu::IVec3& localSize,
323 const tcu::IVec3& workSize)
324 : TestCase (testCtx, name, description)
325 , m_localSize (localSize)
326 , m_workSize (workSize)
330 void SharedVarAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
// Generates a shader where every invocation atomically increments a shared
// counter and stores its ticket+1 at the ticket's slot, so after the group
// finishes, values[i] == i+1 within each group's slice.
332 const int workGroupSize = multiplyComponents(m_localSize);
333 const int workGroupCount = multiplyComponents(m_workSize);
334 const int numValues = workGroupSize * workGroupCount;
336 std::ostringstream src;
337 src << "#version 310 es\n"
338 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
339 << "layout(binding = 0) writeonly buffer Output {\n"
340 << " uint values[" << numValues << "];\n"
// NOTE(review): the buffer-block terminator ("} sb_out;") and the code that
// zero-initializes the shared counter appear to be missing from this
// extract -- confirm against the upstream file before building.
342 << "shared uint count;\n\n"
343 << "void main (void) {\n"
344 << " uint localSize = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
345 << " uint globalNdx = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
346 << " uint globalOffs = localSize*globalNdx;\n"
349 << " memoryBarrierShared();\n"
// atomicAdd returns the pre-increment value: a unique 0-based ticket.
351 << " uint oldVal = atomicAdd(count, 1u);\n"
352 << " sb_out.values[globalOffs+oldVal] = oldVal+1u;\n"
355 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
358 TestInstance* SharedVarAtomicOpTest::createInstance (Context& context) const
360 return new SharedVarAtomicOpTestInstance(context, m_localSize, m_workSize);
363 SharedVarAtomicOpTestInstance::SharedVarAtomicOpTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
364 : TestInstance (context)
365 , m_localSize (localSize)
366 , m_workSize (workSize)
370 tcu::TestStatus SharedVarAtomicOpTestInstance::iterate (void)
372 const DeviceInterface& vk = m_context.getDeviceInterface();
373 const VkDevice device = m_context.getDevice();
374 const VkQueue queue = m_context.getUniversalQueue();
375 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
376 Allocator& allocator = m_context.getDefaultAllocator();
378 const int workGroupSize = multiplyComponents(m_localSize);
379 const int workGroupCount = multiplyComponents(m_workSize);
381 // Create a buffer and host-visible memory for it
383 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
384 const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
386 // Create descriptor set
388 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
389 DescriptorSetLayoutBuilder()
390 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
393 const Unique<VkDescriptorPool> descriptorPool(
394 DescriptorPoolBuilder()
395 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
396 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
398 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
400 const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
401 DescriptorSetUpdateBuilder()
402 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
405 // Perform the computation
407 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
408 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
409 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
411 const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
413 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
414 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
416 // Start recording commands
418 beginCommandBuffer(vk, *cmdBuffer);
420 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
421 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
423 vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
425 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1u, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
427 endCommandBuffer(vk, *cmdBuffer);
429 // Wait for completion
431 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
433 // Validate the results
435 const Allocation& bufferAllocation = buffer.getAllocation();
436 invalidateAlloc(vk, device, bufferAllocation);
438 const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
440 for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
442 const int globalOffset = groupNdx * workGroupSize;
443 for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
445 const deUint32 res = bufferPtr[globalOffset + localOffset];
446 const deUint32 ref = localOffset + 1;
450 std::ostringstream msg;
451 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
452 return tcu::TestStatus::fail(msg.str());
456 return tcu::TestStatus::pass("Compute succeeded");
459 class SSBOLocalBarrierTest : public vkt::TestCase
462 SSBOLocalBarrierTest (tcu::TestContext& testCtx,
463 const std::string& name,
464 const std::string& description,
465 const tcu::IVec3& localSize,
466 const tcu::IVec3& workSize);
468 void initPrograms (SourceCollections& sourceCollections) const;
469 TestInstance* createInstance (Context& context) const;
472 const tcu::IVec3 m_localSize;
473 const tcu::IVec3 m_workSize;
476 class SSBOLocalBarrierTestInstance : public vkt::TestInstance
479 SSBOLocalBarrierTestInstance (Context& context,
480 const tcu::IVec3& localSize,
481 const tcu::IVec3& workSize);
483 tcu::TestStatus iterate (void);
486 const tcu::IVec3 m_localSize;
487 const tcu::IVec3 m_workSize;
490 SSBOLocalBarrierTest::SSBOLocalBarrierTest (tcu::TestContext& testCtx,
491 const std::string& name,
492 const std::string& description,
493 const tcu::IVec3& localSize,
494 const tcu::IVec3& workSize)
495 : TestCase (testCtx, name, description)
496 , m_localSize (localSize)
497 , m_workSize (workSize)
501 void SSBOLocalBarrierTest::initPrograms (SourceCollections& sourceCollections) const
// Generates a shader in which each invocation writes its own SSBO slot and
// then read-modify-writes its neighbours' slots, separated by buffer memory
// barriers; correct output requires proper cross-invocation ordering.
503 const int workGroupSize = multiplyComponents(m_localSize);
504 const int workGroupCount = multiplyComponents(m_workSize);
505 const int numValues = workGroupSize * workGroupCount;
507 std::ostringstream src;
508 src << "#version 310 es\n"
509 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
// "coherent" makes writes visible to other invocations without an explicit flush.
510 << "layout(binding = 0) coherent buffer Output {\n"
511 << " uint values[" << numValues << "];\n"
// NOTE(review): the buffer-block terminator ("} sb_out;") and the barrier()
// calls paired with the memoryBarrierBuffer() calls below appear to be
// missing from this extract -- confirm against the upstream file.
513 << "void main (void) {\n"
514 << " uint localSize = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
515 << " uint globalNdx = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
516 << " uint globalOffs = localSize*globalNdx;\n"
517 << " uint localOffs = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
519 << " sb_out.values[globalOffs + localOffs] = globalOffs;\n"
520 << " memoryBarrierBuffer();\n"
522 << " sb_out.values[globalOffs + ((localOffs+1u)%localSize)] += localOffs;\n" // += so we read and write
523 << " memoryBarrierBuffer();\n"
525 << " sb_out.values[globalOffs + ((localOffs+2u)%localSize)] += localOffs;\n"
528 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
531 TestInstance* SSBOLocalBarrierTest::createInstance (Context& context) const
533 return new SSBOLocalBarrierTestInstance(context, m_localSize, m_workSize);
536 SSBOLocalBarrierTestInstance::SSBOLocalBarrierTestInstance (Context& context, const tcu::IVec3& localSize, const tcu::IVec3& workSize)
537 : TestInstance (context)
538 , m_localSize (localSize)
539 , m_workSize (workSize)
543 tcu::TestStatus SSBOLocalBarrierTestInstance::iterate (void)
545 const DeviceInterface& vk = m_context.getDeviceInterface();
546 const VkDevice device = m_context.getDevice();
547 const VkQueue queue = m_context.getUniversalQueue();
548 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
549 Allocator& allocator = m_context.getDefaultAllocator();
551 const int workGroupSize = multiplyComponents(m_localSize);
552 const int workGroupCount = multiplyComponents(m_workSize);
554 // Create a buffer and host-visible memory for it
556 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * workGroupSize * workGroupCount;
557 const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
559 // Create descriptor set
561 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
562 DescriptorSetLayoutBuilder()
563 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
566 const Unique<VkDescriptorPool> descriptorPool(
567 DescriptorPoolBuilder()
568 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
569 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
571 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
573 const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
574 DescriptorSetUpdateBuilder()
575 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
578 // Perform the computation
580 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
581 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
582 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
584 const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
586 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
587 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
589 // Start recording commands
591 beginCommandBuffer(vk, *cmdBuffer);
593 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
594 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
596 vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
598 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
600 endCommandBuffer(vk, *cmdBuffer);
602 // Wait for completion
604 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
606 // Validate the results
608 const Allocation& bufferAllocation = buffer.getAllocation();
609 invalidateAlloc(vk, device, bufferAllocation);
611 const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
613 for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
615 const int globalOffset = groupNdx * workGroupSize;
616 for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
618 const deUint32 res = bufferPtr[globalOffset + localOffset];
619 const int offs0 = localOffset - 1 < 0 ? ((localOffset + workGroupSize - 1) % workGroupSize) : ((localOffset - 1) % workGroupSize);
620 const int offs1 = localOffset - 2 < 0 ? ((localOffset + workGroupSize - 2) % workGroupSize) : ((localOffset - 2) % workGroupSize);
621 const deUint32 ref = static_cast<deUint32>(globalOffset + offs0 + offs1);
625 std::ostringstream msg;
626 msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
627 return tcu::TestStatus::fail(msg.str());
631 return tcu::TestStatus::pass("Compute succeeded");
634 class CopyImageToSSBOTest : public vkt::TestCase
637 CopyImageToSSBOTest (tcu::TestContext& testCtx,
638 const std::string& name,
639 const std::string& description,
640 const tcu::IVec2& localSize,
641 const tcu::IVec2& imageSize);
643 void initPrograms (SourceCollections& sourceCollections) const;
644 TestInstance* createInstance (Context& context) const;
647 const tcu::IVec2 m_localSize;
648 const tcu::IVec2 m_imageSize;
651 class CopyImageToSSBOTestInstance : public vkt::TestInstance
654 CopyImageToSSBOTestInstance (Context& context,
655 const tcu::IVec2& localSize,
656 const tcu::IVec2& imageSize);
658 tcu::TestStatus iterate (void);
661 const tcu::IVec2 m_localSize;
662 const tcu::IVec2 m_imageSize;
665 CopyImageToSSBOTest::CopyImageToSSBOTest (tcu::TestContext& testCtx,
666 const std::string& name,
667 const std::string& description,
668 const tcu::IVec2& localSize,
669 const tcu::IVec2& imageSize)
670 : TestCase (testCtx, name, description)
671 , m_localSize (localSize)
672 , m_imageSize (imageSize)
674 DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
675 DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
678 void CopyImageToSSBOTest::initPrograms (SourceCollections& sourceCollections) const
// Generates a shader where each invocation loads one texel from the source
// image and stores it at the row-major linear index in the output SSBO.
680 std::ostringstream src;
681 src << "#version 310 es\n"
682 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
683 << "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_srcImg;\n"
684 << "layout(binding = 0) writeonly buffer Output {\n"
685 << " uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
// NOTE(review): the buffer-block terminator ("} sb_out;") and the closing
// "}" of main() appear to be missing from this extract -- confirm against
// the upstream file before building.
687 << "void main (void) {\n"
// Row stride in texels = total dispatch width.
688 << " uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
689 << " uint value = imageLoad(u_srcImg, ivec2(gl_GlobalInvocationID.xy)).x;\n"
690 << " sb_out.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x] = value;\n"
693 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
696 TestInstance* CopyImageToSSBOTest::createInstance (Context& context) const
698 return new CopyImageToSSBOTestInstance(context, m_localSize, m_imageSize);
701 CopyImageToSSBOTestInstance::CopyImageToSSBOTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
702 : TestInstance (context)
703 , m_localSize (localSize)
704 , m_imageSize (imageSize)
708 tcu::TestStatus CopyImageToSSBOTestInstance::iterate (void)
710 const DeviceInterface& vk = m_context.getDeviceInterface();
711 const VkDevice device = m_context.getDevice();
712 const VkQueue queue = m_context.getUniversalQueue();
713 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
714 Allocator& allocator = m_context.getDefaultAllocator();
718 const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
719 const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
721 const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
722 const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
724 // Staging buffer (source data for image)
726 const deUint32 imageArea = multiplyComponents(m_imageSize);
727 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;
729 const Buffer stagingBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_SRC_BIT), MemoryRequirement::HostVisible);
731 // Populate the staging buffer with test data
733 de::Random rnd(0xab2c7);
734 const Allocation& stagingBufferAllocation = stagingBuffer.getAllocation();
735 deUint32* bufferPtr = static_cast<deUint32*>(stagingBufferAllocation.getHostPtr());
736 for (deUint32 i = 0; i < imageArea; ++i)
737 *bufferPtr++ = rnd.getUint32();
739 flushAlloc(vk, device, stagingBufferAllocation);
742 // Create a buffer to store shader output
744 const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
746 // Create descriptor set
748 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
749 DescriptorSetLayoutBuilder()
750 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
751 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
754 const Unique<VkDescriptorPool> descriptorPool(
755 DescriptorPoolBuilder()
756 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
757 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
758 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
760 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
764 const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
765 const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
767 DescriptorSetUpdateBuilder()
768 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
769 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
772 // Perform the computation
774 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
775 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
776 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
778 const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
779 const tcu::IVec2 workSize = m_imageSize / m_localSize;
781 // Prepare the command buffer
783 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
784 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
786 // Start recording commands
788 beginCommandBuffer(vk, *cmdBuffer);
790 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
791 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
793 const std::vector<VkBufferImageCopy> bufferImageCopy(1, makeBufferImageCopy(m_imageSize));
794 copyBufferToImage(vk, *cmdBuffer, *stagingBuffer, bufferSizeBytes, bufferImageCopy, VK_IMAGE_ASPECT_COLOR_BIT, 1, 1, *image, VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
796 vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
797 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &computeFinishBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
799 endCommandBuffer(vk, *cmdBuffer);
801 // Wait for completion
803 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
806 // Validate the results
808 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
809 invalidateAlloc(vk, device, outputBufferAllocation);
811 const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
812 const deUint32* refBufferPtr = static_cast<deUint32*>(stagingBuffer.getAllocation().getHostPtr());
814 for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
816 const deUint32 res = *(bufferPtr + ndx);
817 const deUint32 ref = *(refBufferPtr + ndx);
821 std::ostringstream msg;
822 msg << "Comparison failed for Output.values[" << ndx << "]";
823 return tcu::TestStatus::fail(msg.str());
826 return tcu::TestStatus::pass("Compute succeeded");
829 class CopySSBOToImageTest : public vkt::TestCase
832 CopySSBOToImageTest (tcu::TestContext& testCtx,
833 const std::string& name,
834 const std::string& description,
835 const tcu::IVec2& localSize,
836 const tcu::IVec2& imageSize);
838 void initPrograms (SourceCollections& sourceCollections) const;
839 TestInstance* createInstance (Context& context) const;
842 const tcu::IVec2 m_localSize;
843 const tcu::IVec2 m_imageSize;
846 class CopySSBOToImageTestInstance : public vkt::TestInstance
849 CopySSBOToImageTestInstance (Context& context,
850 const tcu::IVec2& localSize,
851 const tcu::IVec2& imageSize);
853 tcu::TestStatus iterate (void);
856 const tcu::IVec2 m_localSize;
857 const tcu::IVec2 m_imageSize;
860 CopySSBOToImageTest::CopySSBOToImageTest (tcu::TestContext& testCtx,
861 const std::string& name,
862 const std::string& description,
863 const tcu::IVec2& localSize,
864 const tcu::IVec2& imageSize)
865 : TestCase (testCtx, name, description)
866 , m_localSize (localSize)
867 , m_imageSize (imageSize)
869 DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
870 DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
873 void CopySSBOToImageTest::initPrograms (SourceCollections& sourceCollections) const
// Generates a shader where each invocation reads the row-major element of
// the input SSBO matching its global ID and stores it into the destination image.
875 std::ostringstream src;
876 src << "#version 310 es\n"
877 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
878 << "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_dstImg;\n"
879 << "layout(binding = 0) readonly buffer Input {\n"
880 << " uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
// NOTE(review): the buffer-block terminator ("} sb_in;") and the closing
// "}" of main() appear to be missing from this extract -- confirm against
// the upstream file before building.
882 << "void main (void) {\n"
// Row stride in elements = total dispatch width.
883 << " uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
884 << " uint value = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
885 << " imageStore(u_dstImg, ivec2(gl_GlobalInvocationID.xy), uvec4(value, 0, 0, 0));\n"
888 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
891 TestInstance* CopySSBOToImageTest::createInstance (Context& context) const
893 return new CopySSBOToImageTestInstance(context, m_localSize, m_imageSize);
896 CopySSBOToImageTestInstance::CopySSBOToImageTestInstance (Context& context, const tcu::IVec2& localSize, const tcu::IVec2& imageSize)
897 : TestInstance (context)
898 , m_localSize (localSize)
899 , m_imageSize (imageSize)
903 tcu::TestStatus CopySSBOToImageTestInstance::iterate (void)
905 const DeviceInterface& vk = m_context.getDeviceInterface();
906 const VkDevice device = m_context.getDevice();
907 const VkQueue queue = m_context.getUniversalQueue();
908 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
909 Allocator& allocator = m_context.getDefaultAllocator();
913 const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
914 const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
916 const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
917 const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
919 // Create an input buffer (data to be read in the shader)
921 const deUint32 imageArea = multiplyComponents(m_imageSize);
922 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * imageArea;
924 const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
926 // Populate the buffer with test data
928 de::Random rnd(0x77238ac2);
929 const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
930 deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
931 for (deUint32 i = 0; i < imageArea; ++i)
932 *bufferPtr++ = rnd.getUint32();
934 flushAlloc(vk, device, inputBufferAllocation);
937 // Create a buffer to store shader output (copied from image data)
939 const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
941 // Create descriptor set
943 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
944 DescriptorSetLayoutBuilder()
945 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
946 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
949 const Unique<VkDescriptorPool> descriptorPool(
950 DescriptorPoolBuilder()
951 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
952 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
953 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
955 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
959 const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
960 const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
962 DescriptorSetUpdateBuilder()
963 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
964 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
967 // Perform the computation
969 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
970 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
971 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
973 const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);
975 const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
976 0u, VK_ACCESS_SHADER_WRITE_BIT,
977 VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
978 *image, subresourceRange);
980 const tcu::IVec2 workSize = m_imageSize / m_localSize;
982 // Prepare the command buffer
984 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
985 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
987 // Start recording commands
989 beginCommandBuffer(vk, *cmdBuffer);
991 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
992 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
994 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
995 vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
997 copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);
999 endCommandBuffer(vk, *cmdBuffer);
1001 // Wait for completion
1003 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1006 // Validate the results
1008 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1009 invalidateAlloc(vk, device, outputBufferAllocation);
1011 const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1012 const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());
1014 for (deUint32 ndx = 0; ndx < imageArea; ++ndx)
1016 const deUint32 res = *(bufferPtr + ndx);
1017 const deUint32 ref = *(refBufferPtr + ndx);
1021 std::ostringstream msg;
1022 msg << "Comparison failed for pixel " << ndx;
1023 return tcu::TestStatus::fail(msg.str());
1026 return tcu::TestStatus::pass("Compute succeeded");
1029 class BufferToBufferInvertTest : public vkt::TestCase
1032 void initPrograms (SourceCollections& sourceCollections) const;
1033 TestInstance* createInstance (Context& context) const;
1035 static BufferToBufferInvertTest* UBOToSSBOInvertCase (tcu::TestContext& testCtx,
1036 const std::string& name,
1037 const std::string& description,
1038 const deUint32 numValues,
1039 const tcu::IVec3& localSize,
1040 const tcu::IVec3& workSize);
1042 static BufferToBufferInvertTest* CopyInvertSSBOCase (tcu::TestContext& testCtx,
1043 const std::string& name,
1044 const std::string& description,
1045 const deUint32 numValues,
1046 const tcu::IVec3& localSize,
1047 const tcu::IVec3& workSize);
1050 BufferToBufferInvertTest (tcu::TestContext& testCtx,
1051 const std::string& name,
1052 const std::string& description,
1053 const deUint32 numValues,
1054 const tcu::IVec3& localSize,
1055 const tcu::IVec3& workSize,
1056 const BufferType bufferType);
1058 const BufferType m_bufferType;
1059 const deUint32 m_numValues;
1060 const tcu::IVec3 m_localSize;
1061 const tcu::IVec3 m_workSize;
1064 class BufferToBufferInvertTestInstance : public vkt::TestInstance
1067 BufferToBufferInvertTestInstance (Context& context,
1068 const deUint32 numValues,
1069 const tcu::IVec3& localSize,
1070 const tcu::IVec3& workSize,
1071 const BufferType bufferType);
1073 tcu::TestStatus iterate (void);
1076 const BufferType m_bufferType;
1077 const deUint32 m_numValues;
1078 const tcu::IVec3 m_localSize;
1079 const tcu::IVec3 m_workSize;
1082 BufferToBufferInvertTest::BufferToBufferInvertTest (tcu::TestContext& testCtx,
1083 const std::string& name,
1084 const std::string& description,
1085 const deUint32 numValues,
1086 const tcu::IVec3& localSize,
1087 const tcu::IVec3& workSize,
1088 const BufferType bufferType)
1089 : TestCase (testCtx, name, description)
1090 , m_bufferType (bufferType)
1091 , m_numValues (numValues)
1092 , m_localSize (localSize)
1093 , m_workSize (workSize)
1095 DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1096 DE_ASSERT(m_bufferType == BUFFER_TYPE_UNIFORM || m_bufferType == BUFFER_TYPE_SSBO);
1099 BufferToBufferInvertTest* BufferToBufferInvertTest::UBOToSSBOInvertCase (tcu::TestContext& testCtx,
1100 const std::string& name,
1101 const std::string& description,
1102 const deUint32 numValues,
1103 const tcu::IVec3& localSize,
1104 const tcu::IVec3& workSize)
1106 return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_UNIFORM);
1109 BufferToBufferInvertTest* BufferToBufferInvertTest::CopyInvertSSBOCase (tcu::TestContext& testCtx,
1110 const std::string& name,
1111 const std::string& description,
1112 const deUint32 numValues,
1113 const tcu::IVec3& localSize,
1114 const tcu::IVec3& workSize)
1116 return new BufferToBufferInvertTest(testCtx, name, description, numValues, localSize, workSize, BUFFER_TYPE_SSBO);
1119 void BufferToBufferInvertTest::initPrograms (SourceCollections& sourceCollections) const
1121 std::ostringstream src;
1122 if (m_bufferType == BUFFER_TYPE_UNIFORM)
1124 src << "#version 310 es\n"
1125 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1126 << "layout(binding = 0) readonly uniform Input {\n"
1127 << " uint values[" << m_numValues << "];\n"
1129 << "layout(binding = 1, std140) writeonly buffer Output {\n"
1130 << " uint values[" << m_numValues << "];\n"
1132 << "void main (void) {\n"
1133 << " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1134 << " uint numValuesPerInv = uint(ub_in.values.length()) / (size.x*size.y*size.z);\n"
1135 << " uint groupNdx = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1136 << " uint offset = numValuesPerInv*groupNdx;\n"
1138 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1139 << " sb_out.values[offset + ndx] = ~ub_in.values[offset + ndx];\n"
1142 else if (m_bufferType == BUFFER_TYPE_SSBO)
1144 src << "#version 310 es\n"
1145 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1146 << "layout(binding = 0, std140) readonly buffer Input {\n"
1147 << " uint values[" << m_numValues << "];\n"
1149 << "layout (binding = 1, std140) writeonly buffer Output {\n"
1150 << " uint values[" << m_numValues << "];\n"
1152 << "void main (void) {\n"
1153 << " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1154 << " uint numValuesPerInv = uint(sb_in.values.length()) / (size.x*size.y*size.z);\n"
1155 << " uint groupNdx = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1156 << " uint offset = numValuesPerInv*groupNdx;\n"
1158 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1159 << " sb_out.values[offset + ndx] = ~sb_in.values[offset + ndx];\n"
1163 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1166 TestInstance* BufferToBufferInvertTest::createInstance (Context& context) const
1168 return new BufferToBufferInvertTestInstance(context, m_numValues, m_localSize, m_workSize, m_bufferType);
1171 BufferToBufferInvertTestInstance::BufferToBufferInvertTestInstance (Context& context,
1172 const deUint32 numValues,
1173 const tcu::IVec3& localSize,
1174 const tcu::IVec3& workSize,
1175 const BufferType bufferType)
1176 : TestInstance (context)
1177 , m_bufferType (bufferType)
1178 , m_numValues (numValues)
1179 , m_localSize (localSize)
1180 , m_workSize (workSize)
1184 tcu::TestStatus BufferToBufferInvertTestInstance::iterate (void)
1186 const DeviceInterface& vk = m_context.getDeviceInterface();
1187 const VkDevice device = m_context.getDevice();
1188 const VkQueue queue = m_context.getUniversalQueue();
1189 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
1190 Allocator& allocator = m_context.getDefaultAllocator();
1192 // Customize the test based on buffer type
1194 const VkBufferUsageFlags inputBufferUsageFlags = (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT : VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
1195 const VkDescriptorType inputBufferDescriptorType = (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1196 const deUint32 randomSeed = (m_bufferType == BUFFER_TYPE_UNIFORM ? 0x111223f : 0x124fef);
1198 // Create an input buffer
1200 const VkDeviceSize bufferSizeBytes = sizeof(tcu::UVec4) * m_numValues;
1201 const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, inputBufferUsageFlags), MemoryRequirement::HostVisible);
1203 // Fill the input buffer with data
1205 de::Random rnd(randomSeed);
1206 const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
1207 tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(inputBufferAllocation.getHostPtr());
1208 for (deUint32 i = 0; i < m_numValues; ++i)
1209 bufferPtr[i].x() = rnd.getUint32();
1211 flushAlloc(vk, device, inputBufferAllocation);
1214 // Create an output buffer
1216 const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1218 // Create descriptor set
1220 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1221 DescriptorSetLayoutBuilder()
1222 .addSingleBinding(inputBufferDescriptorType, VK_SHADER_STAGE_COMPUTE_BIT)
1223 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1224 .build(vk, device));
1226 const Unique<VkDescriptorPool> descriptorPool(
1227 DescriptorPoolBuilder()
1228 .addType(inputBufferDescriptorType)
1229 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
1230 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1232 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1234 const VkDescriptorBufferInfo inputBufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
1235 const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
1236 DescriptorSetUpdateBuilder()
1237 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), inputBufferDescriptorType, &inputBufferDescriptorInfo)
1238 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
1239 .update(vk, device);
1241 // Perform the computation
1243 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1244 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1245 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1247 const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);
1249 const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
1251 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1252 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1254 // Start recording commands
1256 beginCommandBuffer(vk, *cmdBuffer);
1258 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1259 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1261 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1262 vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1263 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1265 endCommandBuffer(vk, *cmdBuffer);
1267 // Wait for completion
1269 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1271 // Validate the results
1273 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1274 invalidateAlloc(vk, device, outputBufferAllocation);
1276 const tcu::UVec4* bufferPtr = static_cast<tcu::UVec4*>(outputBufferAllocation.getHostPtr());
1277 const tcu::UVec4* refBufferPtr = static_cast<tcu::UVec4*>(inputBuffer.getAllocation().getHostPtr());
1279 for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1281 const deUint32 res = bufferPtr[ndx].x();
1282 const deUint32 ref = ~refBufferPtr[ndx].x();
1286 std::ostringstream msg;
1287 msg << "Comparison failed for Output.values[" << ndx << "]";
1288 return tcu::TestStatus::fail(msg.str());
1291 return tcu::TestStatus::pass("Compute succeeded");
1294 class InvertSSBOInPlaceTest : public vkt::TestCase
1297 InvertSSBOInPlaceTest (tcu::TestContext& testCtx,
1298 const std::string& name,
1299 const std::string& description,
1300 const deUint32 numValues,
1302 const tcu::IVec3& localSize,
1303 const tcu::IVec3& workSize);
1306 void initPrograms (SourceCollections& sourceCollections) const;
1307 TestInstance* createInstance (Context& context) const;
1310 const deUint32 m_numValues;
1312 const tcu::IVec3 m_localSize;
1313 const tcu::IVec3 m_workSize;
1316 class InvertSSBOInPlaceTestInstance : public vkt::TestInstance
1319 InvertSSBOInPlaceTestInstance (Context& context,
1320 const deUint32 numValues,
1321 const tcu::IVec3& localSize,
1322 const tcu::IVec3& workSize);
1324 tcu::TestStatus iterate (void);
1327 const deUint32 m_numValues;
1328 const tcu::IVec3 m_localSize;
1329 const tcu::IVec3 m_workSize;
1332 InvertSSBOInPlaceTest::InvertSSBOInPlaceTest (tcu::TestContext& testCtx,
1333 const std::string& name,
1334 const std::string& description,
1335 const deUint32 numValues,
1337 const tcu::IVec3& localSize,
1338 const tcu::IVec3& workSize)
1339 : TestCase (testCtx, name, description)
1340 , m_numValues (numValues)
1342 , m_localSize (localSize)
1343 , m_workSize (workSize)
1345 DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1348 void InvertSSBOInPlaceTest::initPrograms (SourceCollections& sourceCollections) const
1350 std::ostringstream src;
1351 src << "#version 310 es\n"
1352 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1353 << "layout(binding = 0) buffer InOut {\n"
1354 << " uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1356 << "void main (void) {\n"
1357 << " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1358 << " uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
1359 << " uint groupNdx = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1360 << " uint offset = numValuesPerInv*groupNdx;\n"
1362 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1363 << " sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
1366 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1369 TestInstance* InvertSSBOInPlaceTest::createInstance (Context& context) const
1371 return new InvertSSBOInPlaceTestInstance(context, m_numValues, m_localSize, m_workSize);
1374 InvertSSBOInPlaceTestInstance::InvertSSBOInPlaceTestInstance (Context& context,
1375 const deUint32 numValues,
1376 const tcu::IVec3& localSize,
1377 const tcu::IVec3& workSize)
1378 : TestInstance (context)
1379 , m_numValues (numValues)
1380 , m_localSize (localSize)
1381 , m_workSize (workSize)
1385 tcu::TestStatus InvertSSBOInPlaceTestInstance::iterate (void)
1387 const DeviceInterface& vk = m_context.getDeviceInterface();
1388 const VkDevice device = m_context.getDevice();
1389 const VkQueue queue = m_context.getUniversalQueue();
1390 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
1391 Allocator& allocator = m_context.getDefaultAllocator();
1393 // Create an input/output buffer
1395 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
1396 const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1398 // Fill the buffer with data
1400 typedef std::vector<deUint32> data_vector_t;
1401 data_vector_t inputData(m_numValues);
1404 de::Random rnd(0x82ce7f);
1405 const Allocation& bufferAllocation = buffer.getAllocation();
1406 deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
1407 for (deUint32 i = 0; i < m_numValues; ++i)
1408 inputData[i] = *bufferPtr++ = rnd.getUint32();
1410 flushAlloc(vk, device, bufferAllocation);
1413 // Create descriptor set
1415 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1416 DescriptorSetLayoutBuilder()
1417 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1418 .build(vk, device));
1420 const Unique<VkDescriptorPool> descriptorPool(
1421 DescriptorPoolBuilder()
1422 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
1423 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1425 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1427 const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
1428 DescriptorSetUpdateBuilder()
1429 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
1430 .update(vk, device);
1432 // Perform the computation
1434 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1435 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1436 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1438 const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1440 const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1442 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1443 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1445 // Start recording commands
1447 beginCommandBuffer(vk, *cmdBuffer);
1449 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1450 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1452 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1453 vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1454 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1456 endCommandBuffer(vk, *cmdBuffer);
1458 // Wait for completion
1460 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1462 // Validate the results
1464 const Allocation& bufferAllocation = buffer.getAllocation();
1465 invalidateAlloc(vk, device, bufferAllocation);
1467 const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
1469 for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1471 const deUint32 res = bufferPtr[ndx];
1472 const deUint32 ref = ~inputData[ndx];
1476 std::ostringstream msg;
1477 msg << "Comparison failed for InOut.values[" << ndx << "]";
1478 return tcu::TestStatus::fail(msg.str());
1481 return tcu::TestStatus::pass("Compute succeeded");
1484 class WriteToMultipleSSBOTest : public vkt::TestCase
1487 WriteToMultipleSSBOTest (tcu::TestContext& testCtx,
1488 const std::string& name,
1489 const std::string& description,
1490 const deUint32 numValues,
1492 const tcu::IVec3& localSize,
1493 const tcu::IVec3& workSize);
1495 void initPrograms (SourceCollections& sourceCollections) const;
1496 TestInstance* createInstance (Context& context) const;
1499 const deUint32 m_numValues;
1501 const tcu::IVec3 m_localSize;
1502 const tcu::IVec3 m_workSize;
1505 class WriteToMultipleSSBOTestInstance : public vkt::TestInstance
1508 WriteToMultipleSSBOTestInstance (Context& context,
1509 const deUint32 numValues,
1510 const tcu::IVec3& localSize,
1511 const tcu::IVec3& workSize);
1513 tcu::TestStatus iterate (void);
1516 const deUint32 m_numValues;
1517 const tcu::IVec3 m_localSize;
1518 const tcu::IVec3 m_workSize;
1521 WriteToMultipleSSBOTest::WriteToMultipleSSBOTest (tcu::TestContext& testCtx,
1522 const std::string& name,
1523 const std::string& description,
1524 const deUint32 numValues,
1526 const tcu::IVec3& localSize,
1527 const tcu::IVec3& workSize)
1528 : TestCase (testCtx, name, description)
1529 , m_numValues (numValues)
1531 , m_localSize (localSize)
1532 , m_workSize (workSize)
1534 DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1537 void WriteToMultipleSSBOTest::initPrograms (SourceCollections& sourceCollections) const
1539 std::ostringstream src;
1540 src << "#version 310 es\n"
1541 << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
1542 << "layout(binding = 0) writeonly buffer Out0 {\n"
1543 << " uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1545 << "layout(binding = 1) writeonly buffer Out1 {\n"
1546 << " uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1548 << "void main (void) {\n"
1549 << " uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1550 << " uint groupNdx = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
1553 << " uint numValuesPerInv = uint(sb_out0.values.length()) / (size.x*size.y*size.z);\n"
1554 << " uint offset = numValuesPerInv*groupNdx;\n"
1556 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1557 << " sb_out0.values[offset + ndx] = offset + ndx;\n"
1560 << " uint numValuesPerInv = uint(sb_out1.values.length()) / (size.x*size.y*size.z);\n"
1561 << " uint offset = numValuesPerInv*groupNdx;\n"
1563 << " for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1564 << " sb_out1.values[offset + ndx] = uint(sb_out1.values.length()) - offset - ndx;\n"
1568 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1571 TestInstance* WriteToMultipleSSBOTest::createInstance (Context& context) const
1573 return new WriteToMultipleSSBOTestInstance(context, m_numValues, m_localSize, m_workSize);
1576 WriteToMultipleSSBOTestInstance::WriteToMultipleSSBOTestInstance (Context& context,
1577 const deUint32 numValues,
1578 const tcu::IVec3& localSize,
1579 const tcu::IVec3& workSize)
1580 : TestInstance (context)
1581 , m_numValues (numValues)
1582 , m_localSize (localSize)
1583 , m_workSize (workSize)
1587 tcu::TestStatus WriteToMultipleSSBOTestInstance::iterate (void)
1589 const DeviceInterface& vk = m_context.getDeviceInterface();
1590 const VkDevice device = m_context.getDevice();
1591 const VkQueue queue = m_context.getUniversalQueue();
1592 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
1593 Allocator& allocator = m_context.getDefaultAllocator();
1595 // Create two output buffers
1597 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
1598 const Buffer buffer0(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1599 const Buffer buffer1(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1601 // Create descriptor set
1603 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1604 DescriptorSetLayoutBuilder()
1605 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1606 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1607 .build(vk, device));
1609 const Unique<VkDescriptorPool> descriptorPool(
1610 DescriptorPoolBuilder()
1611 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
1612 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1614 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1616 const VkDescriptorBufferInfo buffer0DescriptorInfo = makeDescriptorBufferInfo(*buffer0, 0ull, bufferSizeBytes);
1617 const VkDescriptorBufferInfo buffer1DescriptorInfo = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
1618 DescriptorSetUpdateBuilder()
1619 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer0DescriptorInfo)
1620 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer1DescriptorInfo)
1621 .update(vk, device);
1623 // Perform the computation
1625 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
1626 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1627 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
1629 const VkBufferMemoryBarrier shaderWriteBarriers[] =
1631 makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer0, 0ull, bufferSizeBytes),
1632 makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes)
1635 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1636 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1638 // Start recording commands
1640 beginCommandBuffer(vk, *cmdBuffer);
1642 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1643 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1645 vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1646 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, DE_LENGTH_OF_ARRAY(shaderWriteBarriers), shaderWriteBarriers, 0, (const VkImageMemoryBarrier*)DE_NULL);
1648 endCommandBuffer(vk, *cmdBuffer);
1650 // Wait for completion
1652 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1654 // Validate the results
1656 const Allocation& buffer0Allocation = buffer0.getAllocation();
1657 invalidateAlloc(vk, device, buffer0Allocation);
1658 const deUint32* buffer0Ptr = static_cast<deUint32*>(buffer0Allocation.getHostPtr());
1660 for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1662 const deUint32 res = buffer0Ptr[ndx];
1663 const deUint32 ref = ndx;
1667 std::ostringstream msg;
1668 msg << "Comparison failed for Out0.values[" << ndx << "] res=" << res << " ref=" << ref;
1669 return tcu::TestStatus::fail(msg.str());
1674 const Allocation& buffer1Allocation = buffer1.getAllocation();
1675 invalidateAlloc(vk, device, buffer1Allocation);
1676 const deUint32* buffer1Ptr = static_cast<deUint32*>(buffer1Allocation.getHostPtr());
1678 for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
1680 const deUint32 res = buffer1Ptr[ndx];
1681 const deUint32 ref = m_numValues - ndx;
1685 std::ostringstream msg;
1686 msg << "Comparison failed for Out1.values[" << ndx << "] res=" << res << " ref=" << ref;
1687 return tcu::TestStatus::fail(msg.str());
1691 return tcu::TestStatus::pass("Compute succeeded");
1694 class SSBOBarrierTest : public vkt::TestCase
1697 SSBOBarrierTest (tcu::TestContext& testCtx,
1698 const std::string& name,
1699 const std::string& description,
1700 const tcu::IVec3& workSize);
1702 void initPrograms (SourceCollections& sourceCollections) const;
1703 TestInstance* createInstance (Context& context) const;
1706 const tcu::IVec3 m_workSize;
1709 class SSBOBarrierTestInstance : public vkt::TestInstance
1712 SSBOBarrierTestInstance (Context& context,
1713 const tcu::IVec3& workSize);
1715 tcu::TestStatus iterate (void);
1718 const tcu::IVec3 m_workSize;
1721 SSBOBarrierTest::SSBOBarrierTest (tcu::TestContext& testCtx,
1722 const std::string& name,
1723 const std::string& description,
1724 const tcu::IVec3& workSize)
1725 : TestCase (testCtx, name, description)
1726 , m_workSize (workSize)
1730 void SSBOBarrierTest::initPrograms (SourceCollections& sourceCollections) const
1732 sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
1734 "layout (local_size_x = 1) in;\n"
1735 "layout(binding = 2) readonly uniform Constants {\n"
1736 " uint u_baseVal;\n"
1738 "layout(binding = 1) writeonly buffer Output {\n"
1741 "void main (void) {\n"
1742 " uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1743 " values[offset] = u_baseVal + offset;\n"
1746 sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
1748 "layout (local_size_x = 1) in;\n"
1749 "layout(binding = 1) readonly buffer Input {\n"
1752 "layout(binding = 0) coherent buffer Output {\n"
1755 "void main (void) {\n"
1756 " uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
1757 " uint value = values[offset];\n"
1758 " atomicAdd(sum, value);\n"
1762 TestInstance* SSBOBarrierTest::createInstance (Context& context) const
1764 return new SSBOBarrierTestInstance(context, m_workSize);
1767 SSBOBarrierTestInstance::SSBOBarrierTestInstance (Context& context, const tcu::IVec3& workSize)
1768 : TestInstance (context)
1769 , m_workSize (workSize)
1773 tcu::TestStatus SSBOBarrierTestInstance::iterate (void)
1775 const DeviceInterface& vk = m_context.getDeviceInterface();
1776 const VkDevice device = m_context.getDevice();
1777 const VkQueue queue = m_context.getUniversalQueue();
1778 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
1779 Allocator& allocator = m_context.getDefaultAllocator();
1781 // Create a work buffer used by both shaders
1783 const int workGroupCount = multiplyComponents(m_workSize);
1784 const VkDeviceSize workBufferSizeBytes = sizeof(deUint32) * workGroupCount;
1785 const Buffer workBuffer(vk, device, allocator, makeBufferCreateInfo(workBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::Any);
1787 // Create an output buffer
1789 const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
1790 const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
1792 // Initialize atomic counter value to zero
1794 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1795 deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1796 *outputBufferPtr = 0;
1797 flushAlloc(vk, device, outputBufferAllocation);
1800 // Create a uniform buffer (to pass uniform constants)
1802 const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
1803 const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
1805 // Set the constants in the uniform buffer
1807 const deUint32 baseValue = 127;
1809 const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
1810 deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
1811 uniformBufferPtr[0] = baseValue;
1813 flushAlloc(vk, device, uniformBufferAllocation);
1816 // Create descriptor set
1818 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1819 DescriptorSetLayoutBuilder()
1820 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1821 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1822 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1823 .build(vk, device));
1825 const Unique<VkDescriptorPool> descriptorPool(
1826 DescriptorPoolBuilder()
1827 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
1828 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
1829 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1831 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1833 const VkDescriptorBufferInfo workBufferDescriptorInfo = makeDescriptorBufferInfo(*workBuffer, 0ull, workBufferSizeBytes);
1834 const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
1835 const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
1836 DescriptorSetUpdateBuilder()
1837 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
1838 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &workBufferDescriptorInfo)
1839 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
1840 .update(vk, device);
1842 // Perform the computation
1844 const Unique<VkShaderModule> shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
1845 const Unique<VkShaderModule> shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));
1847 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
1848 const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
1849 const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));
1851 const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
1853 const VkBufferMemoryBarrier betweenShadersBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *workBuffer, 0ull, workBufferSizeBytes);
1855 const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
1857 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1858 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1860 // Start recording commands
1862 beginCommandBuffer(vk, *cmdBuffer);
1864 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
1865 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
1867 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1869 vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1870 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &betweenShadersBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1872 // Switch to the second shader program
1873 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
1875 vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1876 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
1878 endCommandBuffer(vk, *cmdBuffer);
1880 // Wait for completion
1882 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1884 // Validate the results
1886 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
1887 invalidateAlloc(vk, device, outputBufferAllocation);
1889 const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
1890 const deUint32 res = *bufferPtr;
1893 for (int ndx = 0; ndx < workGroupCount; ++ndx)
1894 ref += baseValue + ndx;
1898 std::ostringstream msg;
1899 msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
1900 return tcu::TestStatus::fail(msg.str());
1902 return tcu::TestStatus::pass("Compute succeeded");
1905 class ImageAtomicOpTest : public vkt::TestCase
1908 ImageAtomicOpTest (tcu::TestContext& testCtx,
1909 const std::string& name,
1910 const std::string& description,
1911 const deUint32 localSize,
1912 const tcu::IVec2& imageSize);
1914 void initPrograms (SourceCollections& sourceCollections) const;
1915 TestInstance* createInstance (Context& context) const;
1918 const deUint32 m_localSize;
1919 const tcu::IVec2 m_imageSize;
1922 class ImageAtomicOpTestInstance : public vkt::TestInstance
1925 ImageAtomicOpTestInstance (Context& context,
1926 const deUint32 localSize,
1927 const tcu::IVec2& imageSize);
1929 tcu::TestStatus iterate (void);
1932 const deUint32 m_localSize;
1933 const tcu::IVec2 m_imageSize;
1936 ImageAtomicOpTest::ImageAtomicOpTest (tcu::TestContext& testCtx,
1937 const std::string& name,
1938 const std::string& description,
1939 const deUint32 localSize,
1940 const tcu::IVec2& imageSize)
1941 : TestCase (testCtx, name, description)
1942 , m_localSize (localSize)
1943 , m_imageSize (imageSize)
1947 void ImageAtomicOpTest::initPrograms (SourceCollections& sourceCollections) const
1949 std::ostringstream src;
1950 src << "#version 310 es\n"
1951 << "#extension GL_OES_shader_image_atomic : require\n"
1952 << "layout (local_size_x = " << m_localSize << ") in;\n"
1953 << "layout(binding = 1, r32ui) coherent uniform highp uimage2D u_dstImg;\n"
1954 << "layout(binding = 0) readonly buffer Input {\n"
1955 << " uint values[" << (multiplyComponents(m_imageSize) * m_localSize) << "];\n"
1957 << "void main (void) {\n"
1958 << " uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
1959 << " uint value = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
1961 << " if (gl_LocalInvocationIndex == 0u)\n"
1962 << " imageStore(u_dstImg, ivec2(gl_WorkGroupID.xy), uvec4(0));\n"
1963 << " memoryBarrierImage();\n"
1965 << " imageAtomicAdd(u_dstImg, ivec2(gl_WorkGroupID.xy), value);\n"
1968 sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1971 TestInstance* ImageAtomicOpTest::createInstance (Context& context) const
1973 return new ImageAtomicOpTestInstance(context, m_localSize, m_imageSize);
1976 ImageAtomicOpTestInstance::ImageAtomicOpTestInstance (Context& context, const deUint32 localSize, const tcu::IVec2& imageSize)
1977 : TestInstance (context)
1978 , m_localSize (localSize)
1979 , m_imageSize (imageSize)
1983 tcu::TestStatus ImageAtomicOpTestInstance::iterate (void)
1985 const DeviceInterface& vk = m_context.getDeviceInterface();
1986 const VkDevice device = m_context.getDevice();
1987 const VkQueue queue = m_context.getUniversalQueue();
1988 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
1989 Allocator& allocator = m_context.getDefaultAllocator();
1993 const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
1994 const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
1996 const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
1997 const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
2001 const deUint32 numInputValues = multiplyComponents(m_imageSize) * m_localSize;
2002 const VkDeviceSize inputBufferSizeBytes = sizeof(deUint32) * numInputValues;
2004 const Buffer inputBuffer(vk, device, allocator, makeBufferCreateInfo(inputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2006 // Populate the input buffer with test data
2008 de::Random rnd(0x77238ac2);
2009 const Allocation& inputBufferAllocation = inputBuffer.getAllocation();
2010 deUint32* bufferPtr = static_cast<deUint32*>(inputBufferAllocation.getHostPtr());
2011 for (deUint32 i = 0; i < numInputValues; ++i)
2012 *bufferPtr++ = rnd.getUint32();
2014 flushAlloc(vk, device, inputBufferAllocation);
2017 // Create a buffer to store shader output (copied from image data)
2019 const deUint32 imageArea = multiplyComponents(m_imageSize);
2020 const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32) * imageArea;
2021 const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
2023 // Create descriptor set
2025 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2026 DescriptorSetLayoutBuilder()
2027 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2028 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2029 .build(vk, device));
2031 const Unique<VkDescriptorPool> descriptorPool(
2032 DescriptorPoolBuilder()
2033 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2034 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2035 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2037 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2041 const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2042 const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, inputBufferSizeBytes);
2044 DescriptorSetUpdateBuilder()
2045 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2046 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2047 .update(vk, device);
2049 // Perform the computation
2051 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2052 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2053 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
2055 const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, inputBufferSizeBytes);
2057 const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
2058 (VkAccessFlags)0, VK_ACCESS_SHADER_WRITE_BIT,
2059 VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
2060 *image, subresourceRange);
2062 // Prepare the command buffer
2064 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2065 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2067 // Start recording commands
2069 beginCommandBuffer(vk, *cmdBuffer);
2071 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2072 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2074 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
2075 vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2077 copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);
2079 endCommandBuffer(vk, *cmdBuffer);
2081 // Wait for completion
2083 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2086 // Validate the results
2088 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2089 invalidateAlloc(vk, device, outputBufferAllocation);
2091 const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2092 const deUint32* refBufferPtr = static_cast<deUint32*>(inputBuffer.getAllocation().getHostPtr());
2094 for (deUint32 pixelNdx = 0; pixelNdx < imageArea; ++pixelNdx)
2096 const deUint32 res = bufferPtr[pixelNdx];
2099 for (deUint32 offs = 0; offs < m_localSize; ++offs)
2100 ref += refBufferPtr[pixelNdx * m_localSize + offs];
2104 std::ostringstream msg;
2105 msg << "Comparison failed for pixel " << pixelNdx;
2106 return tcu::TestStatus::fail(msg.str());
2109 return tcu::TestStatus::pass("Compute succeeded");
2112 class ImageBarrierTest : public vkt::TestCase
2115 ImageBarrierTest (tcu::TestContext& testCtx,
2116 const std::string& name,
2117 const std::string& description,
2118 const tcu::IVec2& imageSize);
2120 void initPrograms (SourceCollections& sourceCollections) const;
2121 TestInstance* createInstance (Context& context) const;
2124 const tcu::IVec2 m_imageSize;
2127 class ImageBarrierTestInstance : public vkt::TestInstance
2130 ImageBarrierTestInstance (Context& context,
2131 const tcu::IVec2& imageSize);
2133 tcu::TestStatus iterate (void);
2136 const tcu::IVec2 m_imageSize;
2139 ImageBarrierTest::ImageBarrierTest (tcu::TestContext& testCtx,
2140 const std::string& name,
2141 const std::string& description,
2142 const tcu::IVec2& imageSize)
2143 : TestCase (testCtx, name, description)
2144 , m_imageSize (imageSize)
2148 void ImageBarrierTest::initPrograms (SourceCollections& sourceCollections) const
2150 sourceCollections.glslSources.add("comp0") << glu::ComputeSource(
2152 "layout (local_size_x = 1) in;\n"
2153 "layout(binding = 2) readonly uniform Constants {\n"
2154 " uint u_baseVal;\n"
2156 "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_img;\n"
2157 "void main (void) {\n"
2158 " uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
2159 " imageStore(u_img, ivec2(gl_WorkGroupID.xy), uvec4(offset + u_baseVal, 0, 0, 0));\n"
2162 sourceCollections.glslSources.add("comp1") << glu::ComputeSource(
2164 "layout (local_size_x = 1) in;\n"
2165 "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_img;\n"
2166 "layout(binding = 0) coherent buffer Output {\n"
2169 "void main (void) {\n"
2170 " uint value = imageLoad(u_img, ivec2(gl_WorkGroupID.xy)).x;\n"
2171 " atomicAdd(sum, value);\n"
2175 TestInstance* ImageBarrierTest::createInstance (Context& context) const
2177 return new ImageBarrierTestInstance(context, m_imageSize);
2180 ImageBarrierTestInstance::ImageBarrierTestInstance (Context& context, const tcu::IVec2& imageSize)
2181 : TestInstance (context)
2182 , m_imageSize (imageSize)
2186 tcu::TestStatus ImageBarrierTestInstance::iterate (void)
2188 const DeviceInterface& vk = m_context.getDeviceInterface();
2189 const VkDevice device = m_context.getDevice();
2190 const VkQueue queue = m_context.getUniversalQueue();
2191 const deUint32 queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
2192 Allocator& allocator = m_context.getDefaultAllocator();
2194 // Create an image used by both shaders
2196 const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_STORAGE_BIT);
2197 const Image image(vk, device, allocator, imageParams, MemoryRequirement::Any);
2199 const VkImageSubresourceRange subresourceRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
2200 const Unique<VkImageView> imageView(makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
2202 // Create an output buffer
2204 const VkDeviceSize outputBufferSizeBytes = sizeof(deUint32);
2205 const Buffer outputBuffer(vk, device, allocator, makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2207 // Initialize atomic counter value to zero
2209 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2210 deUint32* outputBufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2211 *outputBufferPtr = 0;
2212 flushAlloc(vk, device, outputBufferAllocation);
2215 // Create a uniform buffer (to pass uniform constants)
2217 const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32);
2218 const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2220 // Set the constants in the uniform buffer
2222 const deUint32 baseValue = 127;
2224 const Allocation& uniformBufferAllocation = uniformBuffer.getAllocation();
2225 deUint32* uniformBufferPtr = static_cast<deUint32*>(uniformBufferAllocation.getHostPtr());
2226 uniformBufferPtr[0] = baseValue;
2228 flushAlloc(vk, device, uniformBufferAllocation);
2231 // Create descriptor set
2233 const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2234 DescriptorSetLayoutBuilder()
2235 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2236 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2237 .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2238 .build(vk, device));
2240 const Unique<VkDescriptorPool> descriptorPool(
2241 DescriptorPoolBuilder()
2242 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2243 .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2244 .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2245 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2247 const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2249 const VkDescriptorImageInfo imageDescriptorInfo = makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2250 const VkDescriptorBufferInfo outputBufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
2251 const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2252 DescriptorSetUpdateBuilder()
2253 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
2254 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2255 .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2256 .update(vk, device);
2258 // Perform the computation
2260 const Unique<VkShaderModule> shaderModule0(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp0"), 0));
2261 const Unique<VkShaderModule> shaderModule1(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp1"), 0));
2263 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2264 const Unique<VkPipeline> pipeline0(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule0));
2265 const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule1));
2267 const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2269 const VkImageMemoryBarrier imageLayoutBarrier = makeImageMemoryBarrier(
2271 VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
2272 *image, subresourceRange);
2274 const VkImageMemoryBarrier imageBarrierBetweenShaders = makeImageMemoryBarrier(
2275 VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
2276 VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_GENERAL,
2277 *image, subresourceRange);
2279 const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
2281 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2282 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2284 // Start recording commands
2286 beginCommandBuffer(vk, *cmdBuffer);
2288 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline0);
2289 vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2291 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &writeUniformConstantsBarrier, 1, &imageLayoutBarrier);
2293 vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2294 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 0, (const VkBufferMemoryBarrier*)DE_NULL, 1, &imageBarrierBetweenShaders);
2296 // Switch to the second shader program
2297 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
2299 vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2300 vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &afterComputeBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2302 endCommandBuffer(vk, *cmdBuffer);
2304 // Wait for completion
2306 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2308 // Validate the results
2310 const Allocation& outputBufferAllocation = outputBuffer.getAllocation();
2311 invalidateAlloc(vk, device, outputBufferAllocation);
2313 const int numValues = multiplyComponents(m_imageSize);
2314 const deUint32* bufferPtr = static_cast<deUint32*>(outputBufferAllocation.getHostPtr());
2315 const deUint32 res = *bufferPtr;
2318 for (int ndx = 0; ndx < numValues; ++ndx)
2319 ref += baseValue + ndx;
2323 std::ostringstream msg;
2324 msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
2325 return tcu::TestStatus::fail(msg.str());
2327 return tcu::TestStatus::pass("Compute succeeded");
2330 class ComputeTestInstance : public vkt::TestInstance
2333 ComputeTestInstance (Context& context)
2334 : TestInstance (context)
2335 , m_numPhysDevices (1)
2336 , m_queueFamilyIndex (0)
2338 createDeviceGroup();
2341 ~ComputeTestInstance ()
2345 void createDeviceGroup (void);
2346 const vk::DeviceInterface& getDeviceInterface (void) { return *m_deviceDriver; }
2347 vk::VkInstance getInstance (void) { return m_deviceGroupInstance; }
2348 vk::VkDevice getDevice (void) { return *m_logicalDevice; }
2349 vk::VkPhysicalDevice getPhysicalDevice (deUint32 i = 0){ return m_physicalDevices[i]; }
2352 deUint32 m_numPhysDevices;
2353 deUint32 m_queueFamilyIndex;
2356 CustomInstance m_deviceGroupInstance;
2357 vk::Move<vk::VkDevice> m_logicalDevice;
2358 std::vector<vk::VkPhysicalDevice> m_physicalDevices;
2359 #ifndef CTS_USES_VULKANSC
2360 de::MovePtr<vk::DeviceDriver> m_deviceDriver;
2362 de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter> m_deviceDriver;
2363 #endif // CTS_USES_VULKANSC
2366 void ComputeTestInstance::createDeviceGroup (void)
2368 const tcu::CommandLine& cmdLine = m_context.getTestContext().getCommandLine();
2369 const deUint32 devGroupIdx = cmdLine.getVKDeviceGroupId() - 1;
2370 const deUint32 physDeviceIdx = cmdLine.getVKDeviceId() - 1;
2371 const float queuePriority = 1.0f;
2372 const std::vector<std::string> requiredExtensions (1, "VK_KHR_device_group_creation");
2373 m_deviceGroupInstance = createCustomInstanceWithExtensions(m_context, requiredExtensions);
2374 std::vector<VkPhysicalDeviceGroupProperties> devGroupProperties = enumeratePhysicalDeviceGroups(m_context.getInstanceInterface(), m_deviceGroupInstance);
2375 m_numPhysDevices = devGroupProperties[devGroupIdx].physicalDeviceCount;
2376 std::vector<const char*> deviceExtensions;
2378 if (!isCoreDeviceExtension(m_context.getUsedApiVersion(), "VK_KHR_device_group"))
2379 deviceExtensions.push_back("VK_KHR_device_group");
2381 VkDeviceGroupDeviceCreateInfo deviceGroupInfo =
2383 VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO, //stype
2385 devGroupProperties[devGroupIdx].physicalDeviceCount, //physicalDeviceCount
2386 devGroupProperties[devGroupIdx].physicalDevices //physicalDevices
2388 const InstanceDriver& instance (m_deviceGroupInstance.getDriver());
2389 const VkPhysicalDeviceFeatures deviceFeatures = getPhysicalDeviceFeatures(instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx]);
2390 const std::vector<VkQueueFamilyProperties> queueProps = getPhysicalDeviceQueueFamilyProperties(instance, devGroupProperties[devGroupIdx].physicalDevices[physDeviceIdx]);
2392 m_physicalDevices.resize(m_numPhysDevices);
2393 for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2394 m_physicalDevices[physDevIdx] = devGroupProperties[devGroupIdx].physicalDevices[physDevIdx];
2396 for (size_t queueNdx = 0; queueNdx < queueProps.size(); queueNdx++)
2398 if (queueProps[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
2399 m_queueFamilyIndex = (deUint32)queueNdx;
2402 VkDeviceQueueCreateInfo queueInfo =
2404 VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, // VkStructureType sType;
2405 DE_NULL, // const void* pNext;
2406 (VkDeviceQueueCreateFlags)0u, // VkDeviceQueueCreateFlags flags;
2407 m_queueFamilyIndex, // deUint32 queueFamilyIndex;
2408 1u, // deUint32 queueCount;
2409 &queuePriority // const float* pQueuePriorities;
2412 void* pNext = &deviceGroupInfo;
2413 #ifdef CTS_USES_VULKANSC
2414 VkDeviceObjectReservationCreateInfo memReservationInfo = cmdLine.isSubProcess() ? m_context.getResourceInterface()->getStatMax() : resetDeviceObjectReservationCreateInfo();
2415 memReservationInfo.pNext = pNext;
2416 pNext = &memReservationInfo;
2418 VkPhysicalDeviceVulkanSC10Features sc10Features = createDefaultSC10Features();
2419 sc10Features.pNext = pNext;
2420 pNext = &sc10Features;
2421 VkPipelineCacheCreateInfo pcCI;
2422 std::vector<VkPipelinePoolSize> poolSizes;
2423 if (cmdLine.isSubProcess())
2425 if (m_context.getResourceInterface()->getCacheDataSize() > 0)
2429 VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, // VkStructureType sType;
2430 DE_NULL, // const void* pNext;
2431 VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
2432 VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT, // VkPipelineCacheCreateFlags flags;
2433 m_context.getResourceInterface()->getCacheDataSize(), // deUintptr initialDataSize;
2434 m_context.getResourceInterface()->getCacheData() // const void* pInitialData;
2436 memReservationInfo.pipelineCacheCreateInfoCount = 1;
2437 memReservationInfo.pPipelineCacheCreateInfos = &pcCI;
2440 poolSizes = m_context.getResourceInterface()->getPipelinePoolSizes();
2441 if (!poolSizes.empty())
2443 memReservationInfo.pipelinePoolSizeCount = deUint32(poolSizes.size());
2444 memReservationInfo.pPipelinePoolSizes = poolSizes.data();
2448 #endif // CTS_USES_VULKANSC
2450 const VkDeviceCreateInfo deviceInfo =
2452 VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // VkStructureType sType;
2453 pNext, // const void* pNext;
2454 (VkDeviceCreateFlags)0, // VkDeviceCreateFlags flags;
2455 1u , // uint32_t queueCreateInfoCount;
2456 &queueInfo, // const VkDeviceQueueCreateInfo* pQueueCreateInfos;
2457 0u, // uint32_t enabledLayerCount;
2458 DE_NULL, // const char* const* ppEnabledLayerNames;
2459 deUint32(deviceExtensions.size()), // uint32_t enabledExtensionCount;
2460 (deviceExtensions.empty() ? DE_NULL : &deviceExtensions[0]), // const char* const* ppEnabledExtensionNames;
2461 &deviceFeatures, // const VkPhysicalDeviceFeatures* pEnabledFeatures;
2464 m_logicalDevice = createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), m_deviceGroupInstance, instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx], &deviceInfo);
2465 #ifndef CTS_USES_VULKANSC
2466 m_deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_deviceGroupInstance, *m_logicalDevice));
2468 m_deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(new DeviceDriverSC(m_context.getPlatformInterface(), m_context.getInstance(), *m_logicalDevice, m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(), m_context.getDeviceVulkanSC10Properties()), vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *m_logicalDevice));
2469 #endif // CTS_USES_VULKANSC
// Test case for split compute dispatches (vkCmdDispatchBase): a single logical
// grid of m_workSize workgroups is submitted as several partial dispatches with
// non-zero base workgroup offsets, partitioned by m_splitSize.
2472 class DispatchBaseTest : public vkt::TestCase
2475 	DispatchBaseTest	(tcu::TestContext&	testCtx,
2476 		const std::string&	name,
2477 		const std::string&	description,
2478 		const deUint32		numValues,
2479 		const tcu::IVec3&	localsize,
2480 		const tcu::IVec3&	worksize,
2481 		const tcu::IVec3&	splitsize);
2483 	void initPrograms		(SourceCollections& sourceCollections) const;
2484 	TestInstance* createInstance	(Context&			context) const;
	// Number of uint values in the storage buffer processed by the shader.
2487 	const deUint32					m_numValues;
	// Shader workgroup (local) size.
2488 	const tcu::IVec3				m_localSize;
	// Full grid size, in workgroups.
2489 	const tcu::IVec3				m_workSize;
	// Per-physical-device slice of the grid, in workgroups.
2490 	const tcu::IVec3				m_splitSize;
// Instance for DispatchBaseTest: validates the local/work/split size
// combination in its constructor and records the partial dispatches in
// iterate(). Derives from ComputeTestInstance, which provides the device-group
// device and m_numPhysDevices.
2493 class DispatchBaseTestInstance : public ComputeTestInstance
2496 	DispatchBaseTestInstance	(Context&			context,
2497 		const deUint32		numValues,
2498 		const tcu::IVec3&	localsize,
2499 		const tcu::IVec3&	worksize,
2500 		const tcu::IVec3&	splitsize);
	// True iff every component of 'big' is a (>=1) integer multiple of the
	// matching component of 'small'.
2502 	bool			isInputVectorValid	(const tcu::IVec3& small, const tcu::IVec3& big);
2503 	tcu::TestStatus	iterate				(void);
2506 	const deUint32					m_numValues;
2507 	const tcu::IVec3				m_localSize;
2508 	const tcu::IVec3				m_workSize;
2509 	const tcu::IVec3				m_splitWorkSize;
// Constructor: stores the dispatch configuration; no validation here (the
// instance constructor performs the consistency checks).
2512 DispatchBaseTest::DispatchBaseTest (tcu::TestContext&	testCtx,
2513 	const std::string&	name,
2514 	const std::string&	description,
2515 	const deUint32		numValues,
2516 	const tcu::IVec3&	localsize,
2517 	const tcu::IVec3&	worksize,
2518 	const tcu::IVec3&	splitsize)
2519 	: TestCase		(testCtx, name, description)
2520 	, m_numValues	(numValues)
2521 	, m_localSize	(localsize)
2522 	, m_workSize	(worksize)
2523 	, m_splitSize	(splitsize)
// Builds the GLSL ES 3.10 compute shader: each invocation bitwise-inverts its
// slice of the storage buffer, so a full run flips every value exactly once.
2527 void DispatchBaseTest::initPrograms (SourceCollections& sourceCollections) const
2529 	std::ostringstream src;
2530 	src << "#version 310 es\n"
2531 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2533 		<< "layout(binding = 0) buffer InOut {\n"
2534 		<< "    uint values[" << de::toString(m_numValues) << "];\n"
	// The full grid size is passed in a UBO: with split (dispatch-base)
	// dispatches, gl_NumWorkGroups would only reflect each partial dispatch,
	// not the whole logical grid.
2537 		<< "layout(binding = 1) readonly uniform uniformInput {\n"
2538 		<< "    uvec3 gridSize;\n"
2541 		<< "void main (void) {\n"
2542 		<< "    uvec3 size = ubo_in.gridSize * gl_WorkGroupSize;\n"
2543 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2544 		<< "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2545 		<< "    uint offset = numValuesPerInv*index;\n"
2546 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2547 		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
2550 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
// Factory: forwards the stored configuration to the test instance.
2553 TestInstance* DispatchBaseTest::createInstance (Context& context) const
2555 	return new DispatchBaseTestInstance(context, m_numValues, m_localSize, m_workSize, m_splitSize);
// Constructor: stores the configuration and rejects combinations that the
// dispatch-splitting logic in iterate() cannot handle.
2558 DispatchBaseTestInstance::DispatchBaseTestInstance (Context& context,
2559 	const deUint32		numValues,
2560 	const tcu::IVec3&	localsize,
2561 	const tcu::IVec3&	worksize,
2562 	const tcu::IVec3&	splitsize)
2564 	: ComputeTestInstance	(context)
2565 	, m_numValues			(numValues)
2566 	, m_localSize			(localsize)
2567 	, m_workSize			(worksize)
2568 	, m_splitWorkSize		(splitsize)
2570 	// For easy work distribution across physical devices:
2571 	// WorkSize should be a multiple of SplitWorkSize only in the X component
2572 	if ((!isInputVectorValid(m_splitWorkSize, m_workSize)) ||
2573 		(m_workSize.x() <= m_splitWorkSize.x()) ||
2574 		(m_workSize.y() != m_splitWorkSize.y()) ||
2575 		(m_workSize.z() != m_splitWorkSize.z()))
2576 		TCU_THROW(TestError, "Invalid Input.");
2578 	// For easy work distribution within the same physical device:
2579 	// SplitWorkSize should be a multiple of localSize in Y or Z component
	// Note: X components must be exactly equal — iterate() relies on
	// localSize.x == splitWorkSize.x when computing per-device X group counts.
2580 	if ((!isInputVectorValid(m_localSize, m_splitWorkSize)) ||
2581 		(m_localSize.x() != m_splitWorkSize.x()) ||
2582 		(m_localSize.y() >= m_splitWorkSize.y()) ||
2583 		(m_localSize.z() >= m_splitWorkSize.z()))
2584 		TCU_THROW(TestError, "Invalid Input.");
	// Every physical device in the group must receive at least one slice.
2586 	if ((multiplyComponents(m_workSize) / multiplyComponents(m_splitWorkSize)) < (deInt32) m_numPhysDevices)
2587 		TCU_THROW(TestError, "Not enough work to distribute across all physical devices.");
	// The buffer length must be an exact multiple of the total invocation
	// count so numValuesPerInv in the shader divides evenly.
2589 	deUint32 totalWork = multiplyComponents(m_workSize) * multiplyComponents(m_localSize);
2590 	if ((totalWork > numValues) || (numValues % totalWork != 0))
2591 		TCU_THROW(TestError, "Buffer too small/not aligned to cover all values.");
// Returns false when any component of 'big' is smaller than, or not an
// integer multiple of, the matching component of 'small'; true otherwise.
2594 bool DispatchBaseTestInstance::isInputVectorValid(const tcu::IVec3& small, const tcu::IVec3& big)
2596 	if (((big.x() < small.x()) || (big.y() < small.y()) || (big.z() < small.z())) ||
2597 		((big.x() % small.x() != 0) || (big.y() % small.y() != 0) || (big.z() % small.z() != 0)))
// Records one command buffer containing the whole grid as a series of
// vkCmdDispatchBase calls (one X-range per physical device, further split in
// Y/Z by the local size), submits it once, and verifies every buffer value was
// inverted exactly once — proving the partial dispatches tile the grid with no
// gaps or overlaps.
2602 tcu::TestStatus DispatchBaseTestInstance::iterate (void)
2604 	const DeviceInterface&	vk					= getDeviceInterface();
2605 	const VkDevice			device				= getDevice();
2606 	const VkQueue			queue				= getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
2607 	SimpleAllocator			allocator			(vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
	// Accumulates the number of workgroups actually dispatched; checked
	// against multiplyComponents(m_workSize) after recording.
2608 	deUint32				totalWorkloadSize	= 0;
2610 	// Create an uniform and input/output buffer
2611 	const deUint32 uniformBufSize = 3; // Pass the compute grid size
2612 	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
2613 	const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2615 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
2616 	const Buffer buffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
2618 	// Fill the buffers with data
2619 	typedef std::vector<deUint32> data_vector_t;
2620 	data_vector_t uniformInputData(uniformBufSize);
2621 	data_vector_t inputData(m_numValues);
	// Upload the full grid size (in workgroups) to the UBO read by the shader.
2624 		const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2625 		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2626 		uniformInputData[0] = *bufferPtr++ = m_workSize.x();
2627 		uniformInputData[1] = *bufferPtr++ = m_workSize.y();
2628 		uniformInputData[2] = *bufferPtr++ = m_workSize.z();
2629 		flushAlloc(vk, device, bufferAllocation);
	// Fill the storage buffer with random data, keeping a host-side copy
	// (inputData) for later verification.
2633 		de::Random rnd(0x82ce7f);
2634 		const Allocation& bufferAllocation = buffer.getAllocation();
2635 		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2636 		for (deUint32 i = 0; i < m_numValues; ++i)
2637 			inputData[i] = *bufferPtr++ = rnd.getUint32();
2639 		flushAlloc(vk, device, bufferAllocation);
2642 	// Create descriptor set
2643 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2644 		DescriptorSetLayoutBuilder()
2645 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2646 		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2647 		.build(vk, device));
2649 	const Unique<VkDescriptorPool> descriptorPool(
2650 		DescriptorPoolBuilder()
2651 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2652 		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2653 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2655 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2657 	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
2658 	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2660 	DescriptorSetUpdateBuilder()
2661 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2662 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2663 		.update(vk, device);
	// VK_PIPELINE_CREATE_DISPATCH_BASE is required for pipelines used with
	// vkCmdDispatchBase.
2665 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2666 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2667 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, static_cast<VkPipelineCreateFlags>(VK_PIPELINE_CREATE_DISPATCH_BASE), *shaderModule, static_cast<VkPipelineShaderStageCreateFlags>(0u)));
2669 	const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2670 	const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2672 	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2674 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
2675 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2677 	// Start recording commands
2678 	beginCommandBuffer(vk, *cmdBuffer);
2680 	vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2681 	vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2683 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2685 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2687 	// Split the workload across all physical devices based on m_splitWorkSize.x()
2688 	for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
2690 		deUint32 baseGroupX = physDevIdx * m_splitWorkSize.x();
2691 		deUint32 baseGroupY = 0;
2692 		deUint32 baseGroupZ = 0;
2694 		// Split the workload within the physical device based on m_localSize.y() and m_localSize.z()
2695 		for (deInt32 localIdxY = 0; localIdxY < (m_splitWorkSize.y() / m_localSize.y()); localIdxY++)
2697 			for (deInt32 localIdxZ = 0; localIdxZ < (m_splitWorkSize.z() / m_localSize.z()); localIdxZ++)
2699 				deUint32 offsetX = baseGroupX;
2700 				deUint32 offsetY = baseGroupY + localIdxY * m_localSize.y();
2701 				deUint32 offsetZ = baseGroupZ + localIdxZ * m_localSize.z();
	// Despite the names, localSizeX/Y/Z are workgroup *counts* passed to
	// vkCmdDispatchBase, not shader local sizes. The last device takes all
	// remaining X groups; the others dispatch m_localSize.x() groups, which the
	// constructor guarantees equals m_splitWorkSize.x().
2703 				deUint32 localSizeX = (physDevIdx == (m_numPhysDevices - 1)) ? m_workSize.x() - baseGroupX : m_localSize.x();
2704 				deUint32 localSizeY = m_localSize.y();
2705 				deUint32 localSizeZ = m_localSize.z();
2707 				totalWorkloadSize += (localSizeX * localSizeY * localSizeZ);
2708 				vk.cmdDispatchBase(*cmdBuffer, offsetX, offsetY, offsetZ, localSizeX, localSizeY, localSizeZ);
2713 	vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2715 	endCommandBuffer(vk, *cmdBuffer);
2716 	submitCommandsAndWait(vk, device, queue, *cmdBuffer);
	// Sanity check: the recorded dispatches must cover the whole grid. Note
	// this counts workgroups, so it assumes the Y/Z inner loops tile
	// m_splitWorkSize exactly (guaranteed by the constructor's checks).
2718 	if (totalWorkloadSize != deUint32(multiplyComponents(m_workSize)))
2719 		TCU_THROW(TestError, "Not covering the entire workload.");
2721 	// Validate the results
2722 	const Allocation& bufferAllocation = buffer.getAllocation();
2723 	invalidateAlloc(vk, device, bufferAllocation);
2724 	const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2726 	for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
2728 		const deUint32 res = bufferPtr[ndx];
	// Each value must be the bitwise inverse of what the host wrote: inverted
	// exactly once across all partial dispatches.
2729 		const deUint32 ref = ~inputData[ndx];
2733 			std::ostringstream msg;
2734 			msg << "Comparison failed for InOut.values[" << ndx << "]";
2735 			return tcu::TestStatus::fail(msg.str());
2738 	return tcu::TestStatus::pass("Compute succeeded");
// Test case for gl_DeviceIndex (GL_EXT_device_group): runs the same dispatch
// on different subsets of a device group and checks each physical device wrote
// its own device-index-dependent value.
// NOTE(review): the declaration below names its last parameter 'splitsize'
// while the out-of-line definition names it 'worksize' — parameter names are
// not part of the signature, but the mismatch is confusing; also m_splitSize
// has no initializer in the visible constructor and is unused in the code
// visible here — confirm whether it is dead.
2741 class DeviceIndexTest : public vkt::TestCase
2744 	DeviceIndexTest		(tcu::TestContext&	testCtx,
2745 		const std::string&	name,
2746 		const std::string&	description,
2747 		const deUint32		numValues,
2748 		const tcu::IVec3&	localsize,
2749 		const tcu::IVec3&	splitsize);
2751 	void initPrograms		(SourceCollections& sourceCollections) const;
2752 	TestInstance* createInstance	(Context&			context) const;
2755 	const deUint32					m_numValues;
2756 	const tcu::IVec3				m_localSize;
2757 	const tcu::IVec3				m_workSize;
2758 	const tcu::IVec3				m_splitSize;
// Instance for DeviceIndexTest; dispatches with varying device masks and
// validates per-device results in iterate().
2761 class DeviceIndexTestInstance : public ComputeTestInstance
2764 	DeviceIndexTestInstance	(Context&			context,
2765 		const deUint32		numValues,
2766 		const tcu::IVec3&	localsize,
2767 		const tcu::IVec3&	worksize);
2768 	tcu::TestStatus	iterate	(void);
2770 	const deUint32					m_numValues;
2771 	const tcu::IVec3				m_localSize;
	// Non-const, unlike the other members — presumably so the instance may
	// adjust it; the visible code only reads it. TODO(review): confirm.
2772 	tcu::IVec3						m_workSize;
// Constructor: stores the dispatch configuration. (m_splitSize is left
// default-constructed — no initializer appears here.)
2775 DeviceIndexTest::DeviceIndexTest (tcu::TestContext&	testCtx,
2776 	const std::string&	name,
2777 	const std::string&	description,
2778 	const deUint32		numValues,
2779 	const tcu::IVec3&	localsize,
2780 	const tcu::IVec3&	worksize)
2781 	: TestCase		(testCtx, name, description)
2782 	, m_numValues	(numValues)
2783 	, m_localSize	(localsize)
2784 	, m_workSize	(worksize)
// Builds the compute shader: every invocation writes
// baseOffset[0] + baseOffset[gl_DeviceIndex + 1] into its slice of the
// storage buffer, so the output identifies which physical device executed the
// dispatch. Requires GL_EXT_device_group for gl_DeviceIndex.
2788 void DeviceIndexTest::initPrograms (SourceCollections& sourceCollections) const
2790 	std::ostringstream src;
2791 	src << "#version 310 es\n"
2792 		<< "#extension GL_EXT_device_group : require\n"
2793 		<< "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ", local_size_z = " << m_localSize.z() << ") in;\n"
2795 		<< "layout(binding = 0) buffer InOut {\n"
2796 		<< "    uint values[" << de::toString(m_numValues) << "];\n"
	// One slot per possible device index, plus slot 0 for the per-iteration
	// constant written by the host.
2799 		<< "layout(binding = 1) readonly uniform uniformInput {\n"
2800 		<< "    uint baseOffset[1+" << VK_MAX_DEVICE_GROUP_SIZE << "];\n"
2803 		<< "void main (void) {\n"
2804 		<< "    uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
2805 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2806 		<< "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
2807 		<< "    uint offset = numValuesPerInv*index;\n"
2808 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2809 		<< "        sb_inout.values[offset + ndx] = ubo_in.baseOffset[0] + ubo_in.baseOffset[gl_DeviceIndex + 1];\n"
2812 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
// Factory: forwards the stored configuration to the test instance.
2815 TestInstance* DeviceIndexTest::createInstance (Context& context) const
2817 	return new DeviceIndexTestInstance(context, m_numValues, m_localSize, m_workSize);
// Constructor: stores the configuration; all work happens in iterate().
2820 DeviceIndexTestInstance::DeviceIndexTestInstance (Context& context,
2821 	const deUint32		numValues,
2822 	const tcu::IVec3&	localsize,
2823 	const tcu::IVec3&	worksize)
2825 	: ComputeTestInstance	(context)
2826 	, m_numValues			(numValues)
2827 	, m_localSize			(localsize)
2828 	, m_workSize			(worksize)
// Dispatches the gl_DeviceIndex shader under every possible device mask of the
// group (vkCmdSetDeviceMask), using a storage buffer whose memory is allocated
// with a device mask covering all devices so each device has its own replica.
// Afterwards, for each device in the mask, the per-device replica is copied to
// a host-visible check buffer and validated against the expected
// device-index-dependent value.
2831 tcu::TestStatus DeviceIndexTestInstance::iterate (void)
2833 	const DeviceInterface&	vk					= getDeviceInterface();
2834 	const VkDevice			device				= getDevice();
2835 	const VkQueue			queue				= getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
2836 	SimpleAllocator			allocator			(vk, device, getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
	// Mask with one bit set per physical device in the group.
2837 	const deUint32			allocDeviceMask		= (1 << m_numPhysDevices) - 1;
2838 	de::Random				rnd					(0x82ce7f);
2839 	Move<VkBuffer>			sboBuffer;
2840 	vk::Move<vk::VkDeviceMemory>	sboBufferMemory;
2842 	// Create an uniform and output buffer
	// 4 uints per logical array element: the shader's std140 uniform array of
	// uint has a 16-byte element stride, hence the 4*(physDevIdx+1) indexing
	// during validation below. TODO(review): confirm std140 is the layout in use.
2843 	const deUint32 uniformBufSize = 4 * (1 + VK_MAX_DEVICE_GROUP_SIZE);
2844 	const VkDeviceSize uniformBufferSizeBytes = sizeof(deUint32) * uniformBufSize;
2845 	const Buffer uniformBuffer(vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT), MemoryRequirement::HostVisible);
2847 	const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * m_numValues;
2848 	const Buffer checkBuffer(vk, device, allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT), MemoryRequirement::HostVisible);
2850 	// create SBO buffer
2852 		const VkBufferCreateInfo sboBufferParams =
2854 			VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,	// sType
2857 			(VkDeviceSize)bufferSizeBytes,			// size
2858 			VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,	// usage
2859 			VK_SHARING_MODE_EXCLUSIVE,				// sharingMode
2860 			1u,										// queueFamilyIndexCount
2861 			&m_queueFamilyIndex,					// pQueueFamilyIndices
2863 		sboBuffer = createBuffer(vk, device, &sboBufferParams);
	// Manually pick a DEVICE_LOCAL memory type: the allocation must be
	// device-local so each physical device gets its own instance of the memory.
2865 		VkMemoryRequirements memReqs = getBufferMemoryRequirements(vk, device, sboBuffer.get());
2866 		deUint32 memoryTypeNdx = 0;
2867 		const VkPhysicalDeviceMemoryProperties deviceMemProps = getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice());
2868 		for ( memoryTypeNdx = 0; memoryTypeNdx < deviceMemProps.memoryTypeCount; memoryTypeNdx++)
2870 			if ((memReqs.memoryTypeBits & (1u << memoryTypeNdx)) != 0 &&
2871 				(deviceMemProps.memoryTypes[memoryTypeNdx].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
2874 		if (memoryTypeNdx == deviceMemProps.memoryTypeCount)
2875 			TCU_THROW(NotSupportedError, "No compatible memory type found");
	// Allocate with an explicit device mask covering every device in the group.
2877 		const VkMemoryAllocateFlagsInfo allocDeviceMaskInfo =
2879 			VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,	// sType
2881 			VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT,				// flags
2882 			allocDeviceMask,								// deviceMask
2885 		VkMemoryAllocateInfo	allocInfo =
2887 			VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,			// sType
2888 			&allocDeviceMaskInfo,							// pNext
2889 			memReqs.size,									// allocationSize
2890 			memoryTypeNdx,									// memoryTypeIndex
2893 		sboBufferMemory = allocateMemory(vk, device, &allocInfo);
2894 		VK_CHECK(vk.bindBufferMemory(device, *sboBuffer, sboBufferMemory.get(), 0));
2897 	// Fill the buffers with data
2898 	typedef std::vector<deUint32> data_vector_t;
2899 	data_vector_t uniformInputData(uniformBufSize, 0);
	// Random per-slot offsets; a host-side copy is kept for validation.
2902 		const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2903 		deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2904 		for (deUint32 i = 0; i < uniformBufSize; ++i)
2905 			uniformInputData[i] = *bufferPtr++ = rnd.getUint32() / 10; // divide to prevent overflow in addition
2907 		flushAlloc(vk, device, bufferAllocation);
2910 	// Create descriptor set
2911 	const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2912 		DescriptorSetLayoutBuilder()
2913 		.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2914 		.addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2915 		.build(vk, device));
2917 	const Unique<VkDescriptorPool> descriptorPool(
2918 		DescriptorPoolBuilder()
2919 		.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2920 		.addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2921 		.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2923 	const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2925 	const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*sboBuffer, 0ull, bufferSizeBytes);
2926 	const VkDescriptorBufferInfo uniformBufferDescriptorInfo = makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2928 	DescriptorSetUpdateBuilder()
2929 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2930 		.writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u), VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2931 		.update(vk, device);
2933 	const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, m_context.getBinaryCollection().get("comp"), 0u));
2934 	const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device, *descriptorSetLayout));
2935 	const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
2937 	const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2938 	const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
2940 	const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
2941 	const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2943 	// Verify multiple device masks
	// Iterate over every non-empty subset of the group's devices.
2944 	for (deUint32 physDevMask = 1; physDevMask < (1u << m_numPhysDevices); physDevMask++)
2946 		deUint32 constantValPerLoop = 0;
	// Rewrite baseOffset[0] (first uint of the uniform buffer) with a fresh
	// constant so results differ between mask iterations.
2948 			const Allocation& bufferAllocation = uniformBuffer.getAllocation();
2949 			deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2950 			constantValPerLoop = *bufferPtr = rnd.getUint32() / 10; // divide to prevent overflow in addition
2951 			flushAlloc(vk, device, bufferAllocation);
2953 		beginCommandBuffer(vk, *cmdBuffer);
2955 		vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
2956 		vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout, 0u, 1u, &descriptorSet.get(), 0u, DE_NULL);
2957 		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostUniformWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
	// Restrict subsequent commands to the devices in the current mask.
2959 		vk.cmdSetDeviceMask(*cmdBuffer, physDevMask);
2960 		vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
2962 		vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2964 		endCommandBuffer(vk, *cmdBuffer);
2965 		submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, physDevMask);
2967 		// Validate the results on all physical devices where compute shader was launched
2968 		const VkBufferMemoryBarrier srcBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT , *sboBuffer, 0ull, bufferSizeBytes);
2969 		const VkBufferMemoryBarrier dstBufferBarrier = makeBufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *checkBuffer, 0ull, bufferSizeBytes);
2970 		const VkBufferCopy	copyParams =
2972 			(VkDeviceSize)0u,						// srcOffset
2973 			(VkDeviceSize)0u,						// dstOffset
2974 			bufferSizeBytes							// size
2977 		for (deUint32 physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
	// Skip devices that did not participate in this dispatch.
2979 			if (!(1<<physDevIdx & physDevMask))
2982 			const deUint32 deviceMask = 1 << physDevIdx;
	// Copy this device's replica of the SBO into the host-visible check buffer.
2984 			beginCommandBuffer(vk, *cmdBuffer);
2985 			vk.cmdSetDeviceMask(*cmdBuffer, deviceMask);
2986 			vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT , VK_PIPELINE_STAGE_TRANSFER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &srcBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2987 			vk.cmdCopyBuffer(*cmdBuffer, *sboBuffer, *checkBuffer, 1, &copyParams);
2988 			vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &dstBufferBarrier, 0, (const VkImageMemoryBarrier*)DE_NULL);
2990 			endCommandBuffer(vk, *cmdBuffer);
2991 			submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, deviceMask);
2993 			const Allocation& bufferAllocation = checkBuffer.getAllocation();
2994 			invalidateAlloc(vk, device, bufferAllocation);
2995 			const deUint32* bufferPtr = static_cast<deUint32*>(bufferAllocation.getHostPtr());
2997 			for (deUint32 ndx = 0; ndx < m_numValues; ++ndx)
2999 				const deUint32 res = bufferPtr[ndx];
	// Expected: per-iteration constant plus this device's slot
	// (index scaled by 4 — see the uniformBufSize comment above).
3000 				const deUint32 ref = constantValPerLoop + uniformInputData[4 * (physDevIdx + 1)];
3004 					std::ostringstream msg;
3005 					msg << "Comparison failed on physical device "<< getPhysicalDevice(physDevIdx) <<" ( deviceMask "<< deviceMask <<" ) for InOut.values[" << ndx << "]";
3006 					return tcu::TestStatus::fail(msg.str());
3012 	return tcu::TestStatus::pass("Compute succeeded");
// Test case that runs compute work on two queues concurrently (see
// ConcurrentComputeInstance); carries no configuration of its own.
3015 class ConcurrentCompute : public vkt::TestCase
3018 	ConcurrentCompute	(tcu::TestContext&	testCtx,
3019 		const std::string&	name,
3020 		const std::string&	description);
3023 	void initPrograms		(SourceCollections& sourceCollections) const;
3024 	TestInstance* createInstance	(Context&			context) const;
// Instance for ConcurrentCompute: all setup (custom device with two compute
// queues) and execution happens in iterate().
3027 class ConcurrentComputeInstance : public vkt::TestInstance
3030 	ConcurrentComputeInstance	(Context& context);
3032 	tcu::TestStatus	iterate		(void);
// Constructor: plain TestCase with no extra state.
3035 ConcurrentCompute::ConcurrentCompute (tcu::TestContext&	testCtx,
3036 	const std::string&	name,
3037 	const std::string&	description)
3038 	: TestCase		(testCtx, name, description)
// Builds a 1x1x1-workgroup compute shader that bitwise-inverts each
// invocation's slice of a fixed 1024-element storage buffer.
3042 void ConcurrentCompute::initPrograms (SourceCollections& sourceCollections) const
3044 	std::ostringstream src;
3045 	src << "#version 310 es\n"
3046 		<< "layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
3047 		<< "layout(binding = 0) buffer InOut {\n"
3048 		<< "    uint values[1024];\n"
3050 		<< "void main (void) {\n"
3051 		<< "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3052 		<< "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3053 		<< "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + gl_GlobalInvocationID.x;\n"
3054 		<< "    uint offset          = numValuesPerInv*groupNdx;\n"
3056 		<< "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3057 		<< "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
3060 	sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
// Factory for the test instance; no parameters to forward.
3063 TestInstance* ConcurrentCompute::createInstance (Context& context) const
3065 	return new ConcurrentComputeInstance(context);
// Constructor: no state beyond the base TestInstance.
3068 ConcurrentComputeInstance::ConcurrentComputeInstance (Context& context)
3069 	: TestInstance	(context)
3073 tcu::TestStatus ConcurrentComputeInstance::iterate (void)
3076 NO_MATCH_FOUND = ~((deUint32)0),
3085 deUint32 queueFamilyIndex;
3088 // const DeviceInterface& vk = m_context.getDeviceInterface();
3089 const deUint32 numValues = 1024;
3090 const CustomInstance instance (createCustomInstanceFromContext(m_context));
3091 const InstanceDriver& instanceDriver (instance.getDriver());
3092 const VkPhysicalDevice physicalDevice = chooseDevice(instanceDriver, instance, m_context.getTestContext().getCommandLine());
3093 tcu::TestLog& log = m_context.getTestContext().getLog();
3094 vk::Move<vk::VkDevice> logicalDevice;
3095 std::vector<VkQueueFamilyProperties> queueFamilyProperties;
3096 VkDeviceCreateInfo deviceInfo;
3097 VkPhysicalDeviceFeatures deviceFeatures;
3098 const float queuePriorities[2] = {1.0f, 0.0f};
3099 VkDeviceQueueCreateInfo queueInfos[2];
3102 {DE_NULL, (deUint32)NO_MATCH_FOUND},
3103 {DE_NULL, (deUint32)NO_MATCH_FOUND}
3106 queueFamilyProperties = getPhysicalDeviceQueueFamilyProperties(instanceDriver, physicalDevice);
3108 for (deUint32 queueNdx = 0; queueNdx < queueFamilyProperties.size(); ++queueNdx)
3110 if (queueFamilyProperties[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
3112 if (NO_MATCH_FOUND == queues[0].queueFamilyIndex)
3113 queues[0].queueFamilyIndex = queueNdx;
3115 if (queues[0].queueFamilyIndex != queueNdx || queueFamilyProperties[queueNdx].queueCount > 1u)
3117 queues[1].queueFamilyIndex = queueNdx;
3123 if (queues[0].queueFamilyIndex == NO_MATCH_FOUND || queues[1].queueFamilyIndex == NO_MATCH_FOUND)
3124 TCU_THROW(NotSupportedError, "Queues couldn't be created");
3126 for (int queueNdx = 0; queueNdx < 2; ++queueNdx)
3128 VkDeviceQueueCreateInfo queueInfo;
3129 deMemset(&queueInfo, 0, sizeof(queueInfo));
3131 queueInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
3132 queueInfo.pNext = DE_NULL;
3133 queueInfo.flags = (VkDeviceQueueCreateFlags)0u;
3134 queueInfo.queueFamilyIndex = queues[queueNdx].queueFamilyIndex;
3135 queueInfo.queueCount = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 2 : 1;
3136 queueInfo.pQueuePriorities = (queueInfo.queueCount == 2) ? queuePriorities : &queuePriorities[queueNdx];
3138 queueInfos[queueNdx] = queueInfo;
3140 if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3144 void* pNext = DE_NULL;
3145 #ifdef CTS_USES_VULKANSC
3146 VkDeviceObjectReservationCreateInfo memReservationInfo = m_context.getTestContext().getCommandLine().isSubProcess() ? m_context.getResourceInterface()->getStatMax() : resetDeviceObjectReservationCreateInfo();
3147 memReservationInfo.pNext = pNext;
3148 pNext = &memReservationInfo;
3150 VkPhysicalDeviceVulkanSC10Features sc10Features = createDefaultSC10Features();
3151 sc10Features.pNext = pNext;
3152 pNext = &sc10Features;
3154 VkPipelineCacheCreateInfo pcCI;
3155 std::vector<VkPipelinePoolSize> poolSizes;
3156 if (m_context.getTestContext().getCommandLine().isSubProcess())
3158 if (m_context.getResourceInterface()->getCacheDataSize() > 0)
3162 VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, // VkStructureType sType;
3163 DE_NULL, // const void* pNext;
3164 VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
3165 VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT, // VkPipelineCacheCreateFlags flags;
3166 m_context.getResourceInterface()->getCacheDataSize(), // deUintptr initialDataSize;
3167 m_context.getResourceInterface()->getCacheData() // const void* pInitialData;
3169 memReservationInfo.pipelineCacheCreateInfoCount = 1;
3170 memReservationInfo.pPipelineCacheCreateInfos = &pcCI;
3173 poolSizes = m_context.getResourceInterface()->getPipelinePoolSizes();
3174 if (!poolSizes.empty())
3176 memReservationInfo.pipelinePoolSizeCount = deUint32(poolSizes.size());
3177 memReservationInfo.pPipelinePoolSizes = poolSizes.data();
3180 #endif // CTS_USES_VULKANSC
3182 deMemset(&deviceInfo, 0, sizeof(deviceInfo));
3183 instanceDriver.getPhysicalDeviceFeatures(physicalDevice, &deviceFeatures);
3185 deviceInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
3186 deviceInfo.pNext = pNext;
3187 deviceInfo.enabledExtensionCount = 0u;
3188 deviceInfo.ppEnabledExtensionNames = DE_NULL;
3189 deviceInfo.enabledLayerCount = 0u;
3190 deviceInfo.ppEnabledLayerNames = DE_NULL;
3191 deviceInfo.pEnabledFeatures = &deviceFeatures;
3192 deviceInfo.queueCreateInfoCount = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 1 : 2;
3193 deviceInfo.pQueueCreateInfos = queueInfos;
3195 logicalDevice = createCustomDevice (m_context.getTestContext().getCommandLine().isValidationEnabled(), m_context.getPlatformInterface(), instance, instanceDriver, physicalDevice, &deviceInfo);
3197 #ifndef CTS_USES_VULKANSC
3198 de::MovePtr<vk::DeviceDriver> deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), instance, *logicalDevice));
3200 de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter> deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(new DeviceDriverSC(m_context.getPlatformInterface(), instance, *logicalDevice, m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(), m_context.getDeviceVulkanSC10Properties()), vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *logicalDevice));
3201 #endif // CTS_USES_VULKANSC
3202 vk::DeviceInterface& vk = *deviceDriver;
3204 for (deUint32 queueReqNdx = 0; queueReqNdx < 2; ++queueReqNdx)
3206 if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3207 vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, queueReqNdx, &queues[queueReqNdx].queue);
3209 vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, 0u, &queues[queueReqNdx].queue);
3212 // Create an input/output buffers
3213 const VkPhysicalDeviceMemoryProperties memoryProperties = vk::getPhysicalDeviceMemoryProperties(instanceDriver, physicalDevice);
3215 de::MovePtr<SimpleAllocator> allocator = de::MovePtr<SimpleAllocator>(new SimpleAllocator(vk, *logicalDevice, memoryProperties));
3216 const VkDeviceSize bufferSizeBytes = sizeof(deUint32) * numValues;
3217 const Buffer buffer1(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3218 const Buffer buffer2(vk, *logicalDevice, *allocator, makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT), MemoryRequirement::HostVisible);
3220 // Fill the buffers with data
3222 typedef std::vector<deUint32> data_vector_t;
3223 data_vector_t inputData(numValues);
3226 de::Random rnd(0x82ce7f);
3227 const Allocation& bufferAllocation1 = buffer1.getAllocation();
3228 const Allocation& bufferAllocation2 = buffer2.getAllocation();
3229 deUint32* bufferPtr1 = static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3230 deUint32* bufferPtr2 = static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3232 for (deUint32 i = 0; i < numValues; ++i)
3234 deUint32 val = rnd.getUint32();
3236 *bufferPtr1++ = val;
3237 *bufferPtr2++ = val;
3240 flushAlloc(vk, *logicalDevice, bufferAllocation1);
3241 flushAlloc(vk, *logicalDevice, bufferAllocation2);
3244 // Create descriptor sets
3246 const Unique<VkDescriptorSetLayout> descriptorSetLayout1(
3247 DescriptorSetLayoutBuilder()
3248 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3249 .build(vk, *logicalDevice));
3251 const Unique<VkDescriptorPool> descriptorPool1(
3252 DescriptorPoolBuilder()
3253 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3254 .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3256 const Unique<VkDescriptorSet> descriptorSet1(makeDescriptorSet(vk, *logicalDevice, *descriptorPool1, *descriptorSetLayout1));
3258 const VkDescriptorBufferInfo bufferDescriptorInfo1 = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
3259 DescriptorSetUpdateBuilder()
3260 .writeSingle(*descriptorSet1, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo1)
3261 .update(vk, *logicalDevice);
3263 const Unique<VkDescriptorSetLayout> descriptorSetLayout2(
3264 DescriptorSetLayoutBuilder()
3265 .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3266 .build(vk, *logicalDevice));
3268 const Unique<VkDescriptorPool> descriptorPool2(
3269 DescriptorPoolBuilder()
3270 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3271 .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3273 const Unique<VkDescriptorSet> descriptorSet2(makeDescriptorSet(vk, *logicalDevice, *descriptorPool2, *descriptorSetLayout2));
3275 const VkDescriptorBufferInfo bufferDescriptorInfo2 = makeDescriptorBufferInfo(*buffer2, 0ull, bufferSizeBytes);
3276 DescriptorSetUpdateBuilder()
3277 .writeSingle(*descriptorSet2, DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo2)
3278 .update(vk, *logicalDevice);
3280 // Perform the computation
3282 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, *logicalDevice, m_context.getBinaryCollection().get("comp"), 0u));
3284 const Unique<VkPipelineLayout> pipelineLayout1(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout1));
3285 const Unique<VkPipeline> pipeline1(makeComputePipeline(vk, *logicalDevice, *pipelineLayout1, *shaderModule));
3286 const VkBufferMemoryBarrier hostWriteBarrier1 = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3287 const VkBufferMemoryBarrier shaderWriteBarrier1 = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3288 const Unique<VkCommandPool> cmdPool1(makeCommandPool(vk, *logicalDevice, queues[0].queueFamilyIndex));
3289 const Unique<VkCommandBuffer> cmdBuffer1(allocateCommandBuffer(vk, *logicalDevice, *cmdPool1, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3291 const Unique<VkPipelineLayout> pipelineLayout2(makePipelineLayout(vk, *logicalDevice, *descriptorSetLayout2));
3292 const Unique<VkPipeline> pipeline2(makeComputePipeline(vk, *logicalDevice, *pipelineLayout2, *shaderModule));
3293 const VkBufferMemoryBarrier hostWriteBarrier2 = makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3294 const VkBufferMemoryBarrier shaderWriteBarrier2 = makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3295 const Unique<VkCommandPool> cmdPool2(makeCommandPool(vk, *logicalDevice, queues[1].queueFamilyIndex));
3296 const Unique<VkCommandBuffer> cmdBuffer2(allocateCommandBuffer(vk, *logicalDevice, *cmdPool2, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3300 beginCommandBuffer(vk, *cmdBuffer1);
3301 vk.cmdBindPipeline(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline1);
3302 vk.cmdBindDescriptorSets(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout1, 0u, 1u, &descriptorSet1.get(), 0u, DE_NULL);
3303 vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3304 vk.cmdDispatch(*cmdBuffer1, 1, 1, 1);
3305 vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier1, 0, (const VkImageMemoryBarrier*)DE_NULL);
3306 endCommandBuffer(vk, *cmdBuffer1);
3310 beginCommandBuffer(vk, *cmdBuffer2);
3311 vk.cmdBindPipeline(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline2);
3312 vk.cmdBindDescriptorSets(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, *pipelineLayout2, 0u, 1u, &descriptorSet2.get(), 0u, DE_NULL);
3313 vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &hostWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3314 vk.cmdDispatch(*cmdBuffer2, 1, 1, 1);
3315 vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, (VkDependencyFlags)0, 0, (const VkMemoryBarrier*)DE_NULL, 1, &shaderWriteBarrier2, 0, (const VkImageMemoryBarrier*)DE_NULL);
3316 endCommandBuffer(vk, *cmdBuffer2);
3318 VkSubmitInfo submitInfo1 =
3320 VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType
3322 0u, // waitSemaphoreCount
3323 DE_NULL, // pWaitSemaphores
3324 (const VkPipelineStageFlags*)DE_NULL, // pWaitDstStageMask
3325 1u, // commandBufferCount
3326 &cmdBuffer1.get(), // pCommandBuffers
3327 0u, // signalSemaphoreCount
3328 DE_NULL // pSignalSemaphores
3331 VkSubmitInfo submitInfo2 =
3333 VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType
3335 0u, // waitSemaphoreCount
3336 DE_NULL, // pWaitSemaphores
3337 (const VkPipelineStageFlags*)DE_NULL, // pWaitDstStageMask
3338 1u, // commandBufferCount
3339 &cmdBuffer2.get(), // pCommandBuffers
3340 0u, // signalSemaphoreCount
3341 DE_NULL // pSignalSemaphores
3344 // Wait for completion
3345 const Unique<VkFence> fence1(createFence(vk, *logicalDevice));
3346 const Unique<VkFence> fence2(createFence(vk, *logicalDevice));
3348 VK_CHECK(vk.queueSubmit(queues[0].queue, 1u, &submitInfo1, *fence1));
3349 VK_CHECK(vk.queueSubmit(queues[1].queue, 1u, &submitInfo2, *fence2));
3351 int err = ERROR_NONE;
3353 // First wait for the low-priority queue
3354 if (VK_SUCCESS != vk.waitForFences(*logicalDevice, 1u, &fence2.get(), DE_TRUE, ~0ull))
3357 // If the high-priority queue hasn't finished, we have a problem.
3358 if (VK_SUCCESS != vk.getFenceStatus(*logicalDevice, fence1.get()))
3359 if (err == ERROR_NONE)
3362 // Wait for the high-priority fence so we don't get errors on teardown.
3363 vk.waitForFences(*logicalDevice, 1u, &fence1.get(), DE_TRUE, ~0ull);
3365 // If we fail() before waiting for all of the fences, error will come from
3366 // teardown instead of the error we want.
3368 if (err == ERROR_WAIT)
3370 return tcu::TestStatus::fail("Failed waiting for low-priority queue fence.");
3373 // Validate the results
3375 const Allocation& bufferAllocation1 = buffer1.getAllocation();
3376 invalidateAlloc(vk, *logicalDevice, bufferAllocation1);
3377 const deUint32* bufferPtr1 = static_cast<deUint32*>(bufferAllocation1.getHostPtr());
3379 const Allocation& bufferAllocation2 = buffer2.getAllocation();
3380 invalidateAlloc(vk, *logicalDevice, bufferAllocation2);
3381 const deUint32* bufferPtr2 = static_cast<deUint32*>(bufferAllocation2.getHostPtr());
3383 for (deUint32 ndx = 0; ndx < numValues; ++ndx)
3385 const deUint32 res1 = bufferPtr1[ndx];
3386 const deUint32 res2 = bufferPtr2[ndx];
3387 const deUint32 inp = inputData[ndx];
3388 const deUint32 ref = ~inp;
3390 if (res1 != ref || res1 != res2)
3392 std::ostringstream msg;
3393 msg << "Comparison failed for InOut.values[" << ndx << "] ref:" << ref <<" res1:" << res1 << " res2:" << res2 << " inp:" << inp;
3394 return tcu::TestStatus::fail(msg.str());
3398 if (err == ERROR_ORDER)
3399 log << tcu::TestLog::Message << "Note: Low-priority queue was faster than high-priority one. This is not an error, but priorities may be inverted." << tcu::TestLog::EndMessage;
3401 return tcu::TestStatus::pass("Test passed");
3404 class EmptyWorkGroupCase : public vkt::TestCase
3407 EmptyWorkGroupCase (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize);
3408 virtual ~EmptyWorkGroupCase (void) {}
3410 TestInstance* createInstance (Context& context) const override;
3411 void initPrograms (vk::SourceCollections& programCollection) const override;
3414 const tcu::UVec3 m_dispatchSize;
3417 class EmptyWorkGroupInstance : public vkt::TestInstance
3420 EmptyWorkGroupInstance (Context& context, const tcu::UVec3& dispatchSize)
3421 : vkt::TestInstance (context)
3422 , m_dispatchSize (dispatchSize)
3424 virtual ~EmptyWorkGroupInstance (void) {}
3426 tcu::TestStatus iterate (void) override;
3429 const tcu::UVec3 m_dispatchSize;
3432 EmptyWorkGroupCase::EmptyWorkGroupCase (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const tcu::UVec3& dispatchSize)
3433 : vkt::TestCase (testCtx, name, description)
3434 , m_dispatchSize (dispatchSize)
3436 DE_ASSERT(m_dispatchSize.x() == 0u || m_dispatchSize.y() == 0u || m_dispatchSize.z() == 0u);
3439 TestInstance* EmptyWorkGroupCase::createInstance (Context& context) const
3441 return new EmptyWorkGroupInstance(context, m_dispatchSize);
3444 void EmptyWorkGroupCase::initPrograms (vk::SourceCollections& programCollection) const
3446 std::ostringstream comp;
3449 << "layout (local_size_x=1, local_size_y=1, local_size_z=1) in;\n"
3450 << "layout (set=0, binding=0) buffer VerificationBlock { uint value; } verif;\n"
3451 << "void main () { atomicAdd(verif.value, 1u); }\n"
3453 programCollection.glslSources.add("comp") << glu::ComputeSource(comp.str());
3456 tcu::TestStatus EmptyWorkGroupInstance::iterate (void)
3458 const auto& vkd = m_context.getDeviceInterface();
3459 const auto device = m_context.getDevice();
3460 auto& alloc = m_context.getDefaultAllocator();
3461 const auto queueIndex = m_context.getUniversalQueueFamilyIndex();
3462 const auto queue = m_context.getUniversalQueue();
3464 const auto verifBufferSize = static_cast<VkDeviceSize>(sizeof(uint32_t));
3465 const auto verifBufferInfo = makeBufferCreateInfo(verifBufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
3466 BufferWithMemory verifBuffer (vkd, device, alloc, verifBufferInfo, MemoryRequirement::HostVisible);
3467 auto& verifBufferAlloc = verifBuffer.getAllocation();
3468 void* verifBufferPtr = verifBufferAlloc.getHostPtr();
3470 deMemset(verifBufferPtr, 0, static_cast<size_t>(verifBufferSize));
3471 flushAlloc(vkd, device, verifBufferAlloc);
3473 DescriptorSetLayoutBuilder layoutBuilder;
3474 layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
3475 const auto descriptorSetLayout = layoutBuilder.build(vkd, device);
3477 const auto pipelineLayout = makePipelineLayout(vkd, device, descriptorSetLayout.get());
3478 const auto shaderModule = createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);
3479 const auto pipeline = makeComputePipeline(vkd, device, pipelineLayout.get(), shaderModule.get());
3481 DescriptorPoolBuilder poolBuilder;
3482 poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
3483 const auto descriptorPool = poolBuilder.build(vkd, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
3484 const auto descriptorSet = makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());
3486 DescriptorSetUpdateBuilder updateBuilder;
3487 const auto verifBufferDescInfo = makeDescriptorBufferInfo(verifBuffer.get(), 0ull, verifBufferSize);
3488 updateBuilder.writeSingle(descriptorSet.get(), DescriptorSetUpdateBuilder::Location::binding(0u), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &verifBufferDescInfo);
3489 updateBuilder.update(vkd, device);
3491 const auto cmdPool = makeCommandPool(vkd, device, queueIndex);
3492 const auto cmdBufferPtr = allocateCommandBuffer(vkd, device, cmdPool.get(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3493 const auto cmdBuffer = cmdBufferPtr.get();
3495 beginCommandBuffer(vkd, cmdBuffer);
3496 vkd.cmdBindPipeline(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.get());
3497 vkd.cmdBindDescriptorSets(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
3498 vkd.cmdDispatch(cmdBuffer, m_dispatchSize.x(), m_dispatchSize.y(), m_dispatchSize.z());
3500 const auto readWriteAccess = (VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
3501 const auto computeToCompute = makeMemoryBarrier(readWriteAccess, readWriteAccess);
3502 vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0U, 1u, &computeToCompute, 0u, nullptr, 0u, nullptr);
3504 vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);
3506 const auto computeToHost = makeMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT);
3507 vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 1u, &computeToHost, 0u, nullptr, 0u, nullptr);
3509 endCommandBuffer(vkd, cmdBuffer);
3510 submitCommandsAndWait(vkd, device, queue, cmdBuffer);
3513 invalidateAlloc(vkd, device, verifBufferAlloc);
3514 deMemcpy(&value, verifBufferPtr, sizeof(value));
3518 std::ostringstream msg;
3519 msg << "Unexpected value found in buffer: " << value << " while expecting 1";
3520 TCU_FAIL(msg.str());
3523 return tcu::TestStatus::pass("Pass");
3526 class MaxWorkGroupSizeTest : public vkt::TestCase
3529 enum class Axis { X = 0, Y = 1, Z = 2 };
3533 // Which axis to maximize.
3537 MaxWorkGroupSizeTest (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params);
3538 virtual ~MaxWorkGroupSizeTest (void) {}
3540 virtual void initPrograms (vk::SourceCollections& programCollection) const;
3541 virtual TestInstance* createInstance (Context& context) const;
3542 virtual void checkSupport (Context& context) const;
3544 // Helper to transform the axis value to an index.
3545 static int getIndex (Axis axis);
3547 // Helper returning the number of invocations according to the test parameters.
3548 static deUint32 getInvocations (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties = nullptr);
3550 // Helper returning the buffer size needed to this test.
3551 static deUint32 getSSBOSize (deUint32 invocations);
3557 class MaxWorkGroupSizeInstance : public vkt::TestInstance
3560 MaxWorkGroupSizeInstance (Context& context, const MaxWorkGroupSizeTest::Params& params);
3561 virtual ~MaxWorkGroupSizeInstance (void) {}
3563 virtual tcu::TestStatus iterate (void);
3566 MaxWorkGroupSizeTest::Params m_params;
3569 int MaxWorkGroupSizeTest::getIndex (Axis axis)
3571 const int ret = static_cast<int>(axis);
3572 DE_ASSERT(ret >= static_cast<int>(Axis::X) && ret <= static_cast<int>(Axis::Z));
3576 deUint32 MaxWorkGroupSizeTest::getInvocations (const Params& params, const vk::InstanceInterface& vki, vk::VkPhysicalDevice physicalDevice, const vk::VkPhysicalDeviceProperties* devProperties)
3578 const auto axis = getIndex(params.axis);
3581 return devProperties->limits.maxComputeWorkGroupSize[axis];
3582 return vk::getPhysicalDeviceProperties(vki, physicalDevice).limits.maxComputeWorkGroupSize[axis];
3585 deUint32 MaxWorkGroupSizeTest::getSSBOSize (deUint32 invocations)
3587 return invocations * static_cast<deUint32>(sizeof(deUint32));
3590 MaxWorkGroupSizeTest::MaxWorkGroupSizeTest (tcu::TestContext& testCtx, const std::string& name, const std::string& description, const Params& params)
3591 : vkt::TestCase (testCtx, name, description)
3595 void MaxWorkGroupSizeTest::initPrograms (vk::SourceCollections& programCollection) const
3597 std::ostringstream shader;
3599 // The actual local sizes will be set using spec constants when running the test instance.
3603 << "layout(constant_id=0) const int local_size_x_val = 1;\n"
3604 << "layout(constant_id=1) const int local_size_y_val = 1;\n"
3605 << "layout(constant_id=2) const int local_size_z_val = 1;\n"
3607 << "layout(local_size_x_id=0, local_size_y_id=1, local_size_z_id=2) in;\n"
3609 << "layout(set=0, binding=0) buffer StorageBuffer {\n"
3610 << " uint values[];\n"
3613 << "void main() {\n"
3614 << " ssbo.values[gl_LocalInvocationIndex] = 1u;\n"
3618 programCollection.glslSources.add("comp") << glu::ComputeSource(shader.str());
3621 TestInstance* MaxWorkGroupSizeTest::createInstance (Context& context) const
3623 return new MaxWorkGroupSizeInstance(context, m_params);
3626 void MaxWorkGroupSizeTest::checkSupport (Context& context) const
3628 const auto& vki = context.getInstanceInterface();
3629 const auto physicalDevice = context.getPhysicalDevice();
3631 const auto properties = vk::getPhysicalDeviceProperties(vki, physicalDevice);
3632 const auto invocations = getInvocations(m_params, vki, physicalDevice, &properties);
3634 if (invocations > properties.limits.maxComputeWorkGroupInvocations)
3635 TCU_FAIL("Reported workgroup size limit in the axis is greater than the global invocation limit");
3637 if (properties.limits.maxStorageBufferRange / static_cast<deUint32>(sizeof(deUint32)) < invocations)
3638 TCU_THROW(NotSupportedError, "Maximum supported storage buffer range too small");
3641 MaxWorkGroupSizeInstance::MaxWorkGroupSizeInstance (Context& context, const MaxWorkGroupSizeTest::Params& params)
3642 : vkt::TestInstance (context)
3646 tcu::TestStatus MaxWorkGroupSizeInstance::iterate (void)
3648 const auto& vki = m_context.getInstanceInterface();
3649 const auto& vkd = m_context.getDeviceInterface();
3650 const auto physicalDevice = m_context.getPhysicalDevice();
3651 const auto device = m_context.getDevice();
3652 auto& alloc = m_context.getDefaultAllocator();
3653 const auto queueIndex = m_context.getUniversalQueueFamilyIndex();
3654 const auto queue = m_context.getUniversalQueue();
3655 auto& log = m_context.getTestContext().getLog();
3657 const auto axis = MaxWorkGroupSizeTest::getIndex(m_params.axis);
3658 const auto invocations = MaxWorkGroupSizeTest::getInvocations(m_params, vki, physicalDevice);
3659 const auto ssboSize = static_cast<vk::VkDeviceSize>(MaxWorkGroupSizeTest::getSSBOSize(invocations));
3662 << tcu::TestLog::Message
3663 << "Running test with " << invocations << " invocations on axis " << axis << " using a storage buffer size of " << ssboSize << " bytes"
3664 << tcu::TestLog::EndMessage
3667 // Main SSBO buffer.
3668 const auto ssboInfo = vk::makeBufferCreateInfo(ssboSize, vk::VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
3669 vk::BufferWithMemory ssbo (vkd, device, alloc, ssboInfo, vk::MemoryRequirement::HostVisible);
3672 const auto shaderModule = vk::createShaderModule(vkd, device, m_context.getBinaryCollection().get("comp"), 0u);
3674 // Descriptor set layouts.
3675 vk::DescriptorSetLayoutBuilder layoutBuilder;
3676 layoutBuilder.addSingleBinding(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, vk::VK_SHADER_STAGE_COMPUTE_BIT);
3677 const auto descriptorSetLayout = layoutBuilder.build(vkd, device);
3679 // Specialization constants: set the number of invocations in the appropriate local size id.
3680 const auto entrySize = static_cast<deUintptr>(sizeof(deInt32));
3681 deInt32 specializationData[3] = { 1, 1, 1 };
3682 specializationData[axis] = static_cast<deInt32>(invocations);
3684 const vk::VkSpecializationMapEntry specializationMaps[3] =
3687 0u, // deUint32 constantID;
3688 0u, // deUint32 offset;
3689 entrySize, // deUintptr size;
3692 1u, // deUint32 constantID;
3693 static_cast<deUint32>(entrySize), // deUint32 offset;
3694 entrySize, // deUintptr size;
3697 2u, // deUint32 constantID;
3698 static_cast<deUint32>(entrySize * 2u), // deUint32 offset;
3699 entrySize, // deUintptr size;
3703 const vk::VkSpecializationInfo specializationInfo =
3705 3u, // deUint32 mapEntryCount;
3706 specializationMaps, // const VkSpecializationMapEntry* pMapEntries;
3707 static_cast<deUintptr>(sizeof(specializationData)), // deUintptr dataSize;
3708 specializationData, // const void* pData;
3712 const vk::VkPipelineLayoutCreateInfo testPipelineLayoutInfo =
3714 vk::VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // VkStructureType sType;
3715 nullptr, // const void* pNext;
3716 0u, // VkPipelineLayoutCreateFlags flags;
3717 1u, // deUint32 setLayoutCount;
3718 &descriptorSetLayout.get(), // const VkDescriptorSetLayout* pSetLayouts;
3719 0u, // deUint32 pushConstantRangeCount;
3720 nullptr, // const VkPushConstantRange* pPushConstantRanges;
3722 const auto testPipelineLayout = vk::createPipelineLayout(vkd, device, &testPipelineLayoutInfo);
3724 const vk::VkComputePipelineCreateInfo testPipelineInfo =
3726 vk::VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, // VkStructureType sType;
3727 nullptr, // const void* pNext;
3728 0u, // VkPipelineCreateFlags flags;
3729 { // VkPipelineShaderStageCreateInfo stage;
3730 vk::VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,// VkStructureType sType;
3731 nullptr, // const void* pNext;
3732 0u, // VkPipelineShaderStageCreateFlags flags;
3733 vk::VK_SHADER_STAGE_COMPUTE_BIT, // VkShaderStageFlagBits stage;
3734 shaderModule.get(), // VkShaderModule module;
3735 "main", // const char* pName;
3736 &specializationInfo, // const VkSpecializationInfo* pSpecializationInfo;
3738 testPipelineLayout.get(), // VkPipelineLayout layout;
3739 DE_NULL, // VkPipeline basePipelineHandle;
3740 0u, // deInt32 basePipelineIndex;
3742 const auto testPipeline = vk::createComputePipeline(vkd, device, DE_NULL, &testPipelineInfo);
3744 // Create descriptor pool and set.
3745 vk::DescriptorPoolBuilder poolBuilder;
3746 poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
3747 const auto descriptorPool = poolBuilder.build(vkd, device, vk::VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
3748 const auto descriptorSet = vk::makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());
3750 // Update descriptor set.
3751 const vk::VkDescriptorBufferInfo ssboBufferInfo =
3753 ssbo.get(), // VkBuffer buffer;
3754 0u, // VkDeviceSize offset;
3755 VK_WHOLE_SIZE, // VkDeviceSize range;
3758 vk::DescriptorSetUpdateBuilder updateBuilder;
3759 updateBuilder.writeSingle(descriptorSet.get(), vk::DescriptorSetUpdateBuilder::Location::binding(0u), vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &ssboBufferInfo);
3760 updateBuilder.update(vkd, device);
3763 auto& ssboAlloc = ssbo.getAllocation();
3764 void* ssboPtr = ssboAlloc.getHostPtr();
3765 deMemset(ssboPtr, 0, static_cast<size_t>(ssboSize));
3766 vk::flushAlloc(vkd, device, ssboAlloc);
3769 const auto cmdPool = vk::makeCommandPool(vkd, device, queueIndex);
3770 const auto cmdBUfferPtr = vk::allocateCommandBuffer(vkd, device, cmdPool.get(), vk::VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3771 const auto cmdBuffer = cmdBUfferPtr.get();
3773 vk::beginCommandBuffer(vkd, cmdBuffer);
3775 // Run the main test shader.
3776 const auto hostToComputeBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_HOST_WRITE_BIT, vk::VK_ACCESS_SHADER_WRITE_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
3777 vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_HOST_BIT, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0u, 0u, nullptr, 1u, &hostToComputeBarrier, 0u, nullptr);
3779 vkd.cmdBindPipeline(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipeline.get());
3780 vkd.cmdBindDescriptorSets(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipelineLayout.get(), 0u, 1u, &descriptorSet.get(), 0u, nullptr);
3781 vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);
3783 const auto computeToHostBarrier = vk::makeBufferMemoryBarrier(vk::VK_ACCESS_SHADER_WRITE_BIT, vk::VK_ACCESS_HOST_READ_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
3784 vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, vk::VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u, nullptr, 1u, &computeToHostBarrier, 0u, nullptr);
3786 vk::endCommandBuffer(vkd, cmdBuffer);
3787 vk::submitCommandsAndWait(vkd, device, queue, cmdBuffer);
3789 // Verify buffer contents.
3790 vk::invalidateAlloc(vkd, device, ssboAlloc);
3791 std::unique_ptr<deUint32[]> valuesArray (new deUint32[invocations]);
3792 deUint32* valuesPtr = valuesArray.get();
3793 deMemcpy(valuesPtr, ssboPtr, static_cast<size_t>(ssboSize));
3795 std::string errorMsg;
3798 for (size_t i = 0; i < invocations; ++i)
3800 if (valuesPtr[i] != 1u)
3803 errorMsg = "Found invalid value for invocation index " + de::toString(i) + ": expected 1u and found " + de::toString(valuesPtr[i]);
3809 return tcu::TestStatus::fail(errorMsg);
3810 return tcu::TestStatus::pass("Pass");
3813 namespace EmptyShaderTest
3816 void createProgram (SourceCollections& dst)
3818 dst.glslSources.add("comp") << glu::ComputeSource(
3820 "layout (local_size_x = 1) in;\n"
3821 "void main (void) {}\n"
3825 tcu::TestStatus createTest (Context& context)
3827 const DeviceInterface& vk = context.getDeviceInterface();
3828 const VkDevice device = context.getDevice();
3829 const VkQueue queue = context.getUniversalQueue();
3830 const deUint32 queueFamilyIndex = context.getUniversalQueueFamilyIndex();
3832 const Unique<VkShaderModule> shaderModule(createShaderModule(vk, device, context.getBinaryCollection().get("comp"), 0u));
3834 const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vk, device));
3835 const Unique<VkPipeline> pipeline(makeComputePipeline(vk, device, *pipelineLayout, *shaderModule));
3837 const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
3838 const Unique<VkCommandBuffer> cmdBuffer(allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3840 // Start recording commands
3842 beginCommandBuffer(vk, *cmdBuffer);
3844 vk.cmdBindPipeline(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
3846 const tcu::IVec3 workGroups(1, 1, 1);
3847 vk.cmdDispatch(*cmdBuffer, workGroups.x(), workGroups.y(), workGroups.z());
3849 endCommandBuffer(vk, *cmdBuffer);
3851 submitCommandsAndWait(vk, device, queue, *cmdBuffer);
3853 return tcu::TestStatus::pass("Compute succeeded");
3856 } // EmptyShaderTest ns
3859 tcu::TestCaseGroup* createBasicComputeShaderTests (tcu::TestContext& testCtx)
3861 de::MovePtr<tcu::TestCaseGroup> basicComputeTests(new tcu::TestCaseGroup(testCtx, "basic", "Basic compute tests"));
3863 addFunctionCaseWithPrograms(basicComputeTests.get(), "empty_shader", "Shader that does nothing", EmptyShaderTest::createProgram, EmptyShaderTest::createTest);
3865 basicComputeTests->addChild(new ConcurrentCompute(testCtx, "concurrent_compute", "Concurrent compute test"));
3867 basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_x", "Use an empty workgroup with size 0 on the X axis", tcu::UVec3(0u, 2u, 3u)));
3868 basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_y", "Use an empty workgroup with size 0 on the Y axis", tcu::UVec3(2u, 0u, 3u)));
3869 basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_z", "Use an empty workgroup with size 0 on the Z axis", tcu::UVec3(2u, 3u, 0u)));
3870 basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_all", "Use an empty workgroup with size 0 on the X, Y and Z axes", tcu::UVec3(0u, 0u, 0u)));
3872 basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_x", "Use the maximum work group size on the X axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::X}));
3873 basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_y", "Use the maximum work group size on the Y axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Y}));
3874 basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_z", "Use the maximum work group size on the Z axis", MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Z}));
3876 basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_single_invocation", "Copy from UBO to SSBO, inverting bits", 256, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
3877 basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_single_group", "Copy from UBO to SSBO, inverting bits", 1024, tcu::IVec3(2,1,4), tcu::IVec3(1,1,1)));
3878 basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_multiple_invocations", "Copy from UBO to SSBO, inverting bits", 1024, tcu::IVec3(1,1,1), tcu::IVec3(2,4,1)));
3879 basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_multiple_groups", "Copy from UBO to SSBO, inverting bits", 1024, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));
3881 basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx, "copy_ssbo_single_invocation", "Copy between SSBOs, inverting bits", 256, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
3882 basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx, "copy_ssbo_multiple_invocations", "Copy between SSBOs, inverting bits", 1024, tcu::IVec3(1,1,1), tcu::IVec3(2,4,1)));
3883 basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx, "copy_ssbo_multiple_groups", "Copy between SSBOs, inverting bits", 1024, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));
3885 basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_rw_single_invocation", "Read and write same SSBO", 256, true, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
3886 basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_rw_multiple_groups", "Read and write same SSBO", 1024, true, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));
3887 basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_unsized_arr_single_invocation", "Read and write same SSBO", 256, false, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
3888 basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_unsized_arr_multiple_groups", "Read and write same SSBO", 1024, false, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));
3890 basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_arr_single_invocation", "Write to multiple SSBOs", 256, true, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
3891 basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_arr_multiple_groups", "Write to multiple SSBOs", 1024, true, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));
3892 basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_unsized_arr_single_invocation", "Write to multiple SSBOs", 256, false, tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
3893 basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_unsized_arr_multiple_groups", "Write to multiple SSBOs", 1024, false, tcu::IVec3(1,4,2), tcu::IVec3(2,2,4)));
3895 basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_single_invocation", "SSBO local barrier usage", tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
3896 basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_single_group", "SSBO local barrier usage", tcu::IVec3(3,2,5), tcu::IVec3(1,1,1)));
3897 basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_multiple_groups", "SSBO local barrier usage", tcu::IVec3(3,4,1), tcu::IVec3(2,7,3)));
3899 basicComputeTests->addChild(new SSBOBarrierTest(testCtx, "ssbo_cmd_barrier_single", "SSBO memory barrier usage", tcu::IVec3(1,1,1)));
3900 basicComputeTests->addChild(new SSBOBarrierTest(testCtx, "ssbo_cmd_barrier_multiple", "SSBO memory barrier usage", tcu::IVec3(11,5,7)));
3902 basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_single_invocation", "Basic shared variable usage", tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
3903 basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_single_group", "Basic shared variable usage", tcu::IVec3(3,2,5), tcu::IVec3(1,1,1)));
3904 basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_multiple_invocations", "Basic shared variable usage", tcu::IVec3(1,1,1), tcu::IVec3(2,5,4)));
3905 basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_multiple_groups", "Basic shared variable usage", tcu::IVec3(3,4,1), tcu::IVec3(2,7,3)));
3907 basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_single_invocation", "Atomic operation with shared var", tcu::IVec3(1,1,1), tcu::IVec3(1,1,1)));
3908 basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_single_group", "Atomic operation with shared var", tcu::IVec3(3,2,5), tcu::IVec3(1,1,1)));
3909 basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_multiple_invocations", "Atomic operation with shared var", tcu::IVec3(1,1,1), tcu::IVec3(2,5,4)));
3910 basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_multiple_groups", "Atomic operation with shared var", tcu::IVec3(3,4,1), tcu::IVec3(2,7,3)));
3912 basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx, "copy_image_to_ssbo_small", "Image to SSBO copy", tcu::IVec2(1,1), tcu::IVec2(64,64)));
3913 basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx, "copy_image_to_ssbo_large", "Image to SSBO copy", tcu::IVec2(2,4), tcu::IVec2(512,512)));
3915 basicComputeTests->addChild(new CopySSBOToImageTest(testCtx, "copy_ssbo_to_image_small", "SSBO to image copy", tcu::IVec2(1, 1), tcu::IVec2(64, 64)));
3916 basicComputeTests->addChild(new CopySSBOToImageTest(testCtx, "copy_ssbo_to_image_large", "SSBO to image copy", tcu::IVec2(2, 4), tcu::IVec2(512, 512)));
3918 basicComputeTests->addChild(new ImageAtomicOpTest(testCtx, "image_atomic_op_local_size_1", "Atomic operation with image", 1, tcu::IVec2(64,64)));
3919 basicComputeTests->addChild(new ImageAtomicOpTest(testCtx, "image_atomic_op_local_size_8", "Atomic operation with image", 8, tcu::IVec2(64,64)));
3921 basicComputeTests->addChild(new ImageBarrierTest(testCtx, "image_barrier_single", "Image barrier", tcu::IVec2(1,1)));
3922 basicComputeTests->addChild(new ImageBarrierTest(testCtx, "image_barrier_multiple", "Image barrier", tcu::IVec2(64,64)));
3924 #ifndef CTS_USES_VULKANSC
3925 basicComputeTests->addChild(cts_amber::createAmberTestCase(testCtx, "write_ssbo_array", "", "compute", "write_ssbo_array.amber"));
3928 return basicComputeTests.release();
3931 tcu::TestCaseGroup* createBasicDeviceGroupComputeShaderTests (tcu::TestContext& testCtx)
3933 de::MovePtr<tcu::TestCaseGroup> deviceGroupComputeTests(new tcu::TestCaseGroup(testCtx, "device_group", "Basic device group compute tests"));
3935 deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx, "dispatch_base", "Compute shader with base groups", 32768, tcu::IVec3(4,2,4), tcu::IVec3(16,8,8), tcu::IVec3(4,8,8)));
3936 deviceGroupComputeTests->addChild(new DeviceIndexTest(testCtx, "device_index", "Compute shader using deviceIndex in SPIRV", 96, tcu::IVec3(3,2,1), tcu::IVec3(2,4,1)));
3938 return deviceGroupComputeTests.release();