Fix invalid block layouts
author Ari Suonpaa <ari.suonpaa@siru.fi>
Tue, 29 May 2018 06:43:22 +0000 (09:43 +0300)
committer Alexander Galazin <Alexander.Galazin@arm.com>
Thu, 14 Jun 2018 14:08:45 +0000 (10:08 -0400)
An upcoming update to the SPIR-V validator checks the correctness of
block layouts. This change fixes the errors in the 16-bit storage tests
that were found by the new validation rules.

Affects:

dEQP-VK.spirv_assembly.instruction.*.16bit_storage.*

Components: Vulkan

VK-GL-CTS issue: 1184

Change-Id: I1794fdefe044c6e256e189ab5a21c69a69eeb58e

external/vulkancts/modules/vulkan/spirv_assembly/vktSpvAsm16bitStorageTests.cpp

index 2dbc069..cf47cea 100644 (file)
@@ -183,7 +183,7 @@ int getStructSize(const ShaderTemplate  shaderTemplate)
 template<RoundingModeFlags RoundingMode>
 bool graphicsCheck16BitFloats (const std::vector<Resource>&    originalFloats,
                                                           const vector<AllocationSp>&  outputAllocs,
-                                                          const std::vector<Resource>& /* expectedOutputs */,
+                                                          const std::vector<Resource>& expectedOutputs,
                                                           tcu::TestLog&                                log)
 {
        if (outputAllocs.size() != originalFloats.size())
@@ -196,10 +196,11 @@ bool graphicsCheck16BitFloats (const std::vector<Resource>&       originalFloats,
 
                const deUint16* returned        = static_cast<const deUint16*>(outputAllocs[outputNdx]->getHostPtr());
                const float*    original        = reinterpret_cast<const float*>(&originalBytes.front());
-               const deUint32  count           = static_cast<deUint32>(originalBytes.size() / sizeof(float));
+               const deUint32  count           = static_cast<deUint32>(expectedOutputs[outputNdx].second->getByteSize() / sizeof(deUint16));
+               const deUint32  inputStride     = static_cast<deUint32>(originalBytes.size() / sizeof(float)) / count;
 
                for (deUint32 numNdx = 0; numNdx < count; ++numNdx)
-                       if (!compare16BitFloat(original[numNdx], returned[numNdx], RoundingMode, log))
+                       if (!compare16BitFloat(original[numNdx * inputStride], returned[numNdx], RoundingMode, log))
                                return false;
        }
 
@@ -234,7 +235,7 @@ bool computeCheckBuffersFloats (const std::vector<BufferSp>&        originalFloats,
 template<RoundingModeFlags RoundingMode>
 bool computeCheck16BitFloats (const std::vector<BufferSp>&     originalFloats,
                                                          const vector<AllocationSp>&   outputAllocs,
-                                                         const std::vector<BufferSp>&  /* expectedOutputs */,
+                                                         const std::vector<BufferSp>&  expectedOutputs,
                                                          tcu::TestLog&                                 log)
 {
        if (outputAllocs.size() != originalFloats.size())
@@ -247,10 +248,11 @@ bool computeCheck16BitFloats (const std::vector<BufferSp>&        originalFloats,
 
                const deUint16* returned        = static_cast<const deUint16*>(outputAllocs[outputNdx]->getHostPtr());
                const float*    original        = reinterpret_cast<const float*>(&originalBytes.front());
-               const deUint32  count           = static_cast<deUint32>(originalBytes.size() / sizeof(float));
+               const deUint32  count           = static_cast<deUint32>(expectedOutputs[outputNdx]->getByteSize() / sizeof(deUint16));
+               const deUint32  inputStride     = static_cast<deUint32>(originalBytes.size() / sizeof(float)) / count;
 
                for (deUint32 numNdx = 0; numNdx < count; ++numNdx)
-                       if (!compare16BitFloat(original[numNdx], returned[numNdx], RoundingMode, log))
+                       if (!compare16BitFloat(original[numNdx * inputStride], returned[numNdx], RoundingMode, log))
                                return false;
        }
 
@@ -1391,53 +1393,67 @@ void addCompute16bitStorageUniform16To32Group (tcu::TestCaseGroup* group)
                        bool            useConstantIndex;
                        unsigned        constantIndex;
                        unsigned        count;
+                       unsigned        inputStride;
                };
 
-               const CompositeType     cTypes[]        =
+               const CompositeType     cTypes[2][5]            =
                {
-                       {"scalar",                              "f32",          "f16",          "OpDecorate %f32arr ArrayStride 4\nOpDecorate %f16arr ArrayStride 2\n",                         false,  0,      numElements},
-                       {"scalar_const_idx_5",  "f32",          "f16",          "OpDecorate %f32arr ArrayStride 4\nOpDecorate %f16arr ArrayStride 2\n",                         true,   5,      numElements},
-                       {"scalar_const_idx_8",  "f32",          "f16",          "OpDecorate %f32arr ArrayStride 4\nOpDecorate %f16arr ArrayStride 2\n",                         true,   8,      numElements},
-                       {"vector",                              "v2f32",        "v2f16",        "OpDecorate %v2f32arr ArrayStride 8\nOpDecorate %v2f16arr ArrayStride 4\n",                     false,  0,      numElements / 2},
-                       {"matrix",                              "v2f32",        "v2f16",        "OpDecorate %m4v2f32arr ArrayStride 32\nOpDecorate %m4v2f16arr ArrayStride 16\n",       false,  0,      numElements / 8},
+                       {
+                               {"scalar",                              "f32",          "f16",          "OpDecorate %f32arr ArrayStride 4\nOpDecorate %f16arr ArrayStride 2\n",                                 false,  0,      numElements,            1},
+                               {"scalar_const_idx_5",  "f32",          "f16",          "OpDecorate %f32arr ArrayStride 4\nOpDecorate %f16arr ArrayStride 2\n",                                 true,   5,      numElements,            1},
+                               {"scalar_const_idx_8",  "f32",          "f16",          "OpDecorate %f32arr ArrayStride 4\nOpDecorate %f16arr ArrayStride 2\n",                                 true,   8,      numElements,            1},
+                               {"vector",                              "v2f32",        "v2f16",        "OpDecorate %v2f32arr ArrayStride 8\nOpDecorate %v2f16arr ArrayStride 4\n",                             false,  0,      numElements / 2,        2},
+                               {"matrix",                              "v2f32",        "v2f16",        "OpDecorate %m4v2f32arr ArrayStride 32\nOpDecorate %m4v2f16arr ArrayStride 16\n",               false,  0,      numElements / 8,        8}
+                       },
+                       {
+                               {"scalar",                              "f32",          "f16",          "OpDecorate %f32arr ArrayStride 4\nOpDecorate %f16arr ArrayStride 16\n",                                false,  0,      numElements,            8},
+                               {"scalar_const_idx_5",  "f32",          "f16",          "OpDecorate %f32arr ArrayStride 4\nOpDecorate %f16arr ArrayStride 16\n",                                true,   5,      numElements,            8},
+                               {"scalar_const_idx_8",  "f32",          "f16",          "OpDecorate %f32arr ArrayStride 4\nOpDecorate %f16arr ArrayStride 16\n",                                true,   8,      numElements,            8},
+                               {"vector",                              "v2f32",        "v2f16",        "OpDecorate %v2f32arr ArrayStride 8\nOpDecorate %v2f16arr ArrayStride 16\n",                    false,  0,      numElements / 2,        8},
+                               {"matrix",                              "v2f32",        "v2f16",        "OpDecorate %m4v2f32arr ArrayStride 32\nOpDecorate %m4v2f16arr ArrayStride 16\n",               false,  0,      numElements / 8,        8}
+                       }
                };
 
-               vector<deFloat16>       float16Data                     = getFloat16s(rnd, numElements);
-               vector<float>           float32Data;
-
-               float32Data.reserve(numElements);
-               for (deUint32 numIdx = 0; numIdx < numElements; ++numIdx)
-                       float32Data.push_back(deFloat16To32(float16Data[numIdx]));
-
                for (deUint32 capIdx = 0; capIdx < DE_LENGTH_OF_ARRAY(CAPABILITIES); ++capIdx)
-                       for (deUint32 tyIdx = 0; tyIdx < DE_LENGTH_OF_ARRAY(cTypes); ++tyIdx)
+                       for (deUint32 tyIdx = 0; tyIdx < DE_LENGTH_OF_ARRAY(cTypes[capIdx]); ++tyIdx)
                        {
                                ComputeShaderSpec               spec;
                                map<string, string>             specs;
-                               string                                  testName        = string(CAPABILITIES[capIdx].name) + "_" + cTypes[tyIdx].name + "_float";
+                               string                                  testName                = string(CAPABILITIES[capIdx].name) + "_" + cTypes[capIdx][tyIdx].name + "_float";
 
                                specs["capability"]             = CAPABILITIES[capIdx].cap;
                                specs["storage"]                = CAPABILITIES[capIdx].decor;
-                               specs["stride"]                 = cTypes[tyIdx].stride;
-                               specs["base32"]                 = cTypes[tyIdx].base32;
-                               specs["base16"]                 = cTypes[tyIdx].base16;
+                               specs["stride"]                 = cTypes[capIdx][tyIdx].stride;
+                               specs["base32"]                 = cTypes[capIdx][tyIdx].base32;
+                               specs["base16"]                 = cTypes[capIdx][tyIdx].base16;
                                specs["types"]                  = floatTypes;
                                specs["convert"]                = "OpFConvert";
-                               specs["constarrayidx"]  = de::toString(cTypes[tyIdx].constantIndex);
-                               if (cTypes[tyIdx].useConstantIndex)
+                               specs["constarrayidx"]  = de::toString(cTypes[capIdx][tyIdx].constantIndex);
+                               if (cTypes[capIdx][tyIdx].useConstantIndex)
                                        specs["arrayindex"] = "c_i32_ci";
                                else
                                        specs["arrayindex"] = "x";
 
+                               const deUint32                  inputStride             = cTypes[capIdx][tyIdx].inputStride;
+                               const deUint32                  count                   = cTypes[capIdx][tyIdx].count;
+                               const deUint32                  scalarsPerItem  = numElements / count;
+                               vector<deFloat16>               float16Data             = getFloat16s(rnd, numElements * inputStride);
+                               vector<float>                   float32Data;
+
+                               float32Data.reserve(numElements);
+                               for (deUint32 numIdx = 0; numIdx < count; ++numIdx)
+                                       for (deUint32 scalarIdx = 0; scalarIdx < scalarsPerItem; scalarIdx++)
+                                               float32Data.push_back(deFloat16To32(float16Data[numIdx * inputStride + scalarIdx]));
+
                                vector<float>                   float32DataConstIdx;
-                               if (cTypes[tyIdx].useConstantIndex)
+                               if (cTypes[capIdx][tyIdx].useConstantIndex)
                                {
-                                       const deUint32 numFloats = numElements / cTypes[tyIdx].count;
+                                       const deUint32 numFloats = numElements / cTypes[capIdx][tyIdx].count;
                                        for (deUint32 numIdx = 0; numIdx < numElements; ++numIdx)
-                                               float32DataConstIdx.push_back(float32Data[cTypes[tyIdx].constantIndex * numFloats + numIdx % numFloats]);
+                                               float32DataConstIdx.push_back(float32Data[cTypes[capIdx][tyIdx].constantIndex * numFloats + numIdx % numFloats]);
                                }
 
-                               if (strcmp(cTypes[tyIdx].name, "matrix") == 0)
+                               if (strcmp(cTypes[capIdx][tyIdx].name, "matrix") == 0)
                                {
                                        specs["index0"]                 = "%zero";
                                        specs["matrix_prefix"]  = "m4";
@@ -1472,12 +1488,12 @@ void addCompute16bitStorageUniform16To32Group (tcu::TestCaseGroup* group)
                                }
 
                                spec.assembly                   = shaderTemplate.specialize(specs);
-                               spec.numWorkGroups              = IVec3(cTypes[tyIdx].count, 1, 1);
+                               spec.numWorkGroups              = IVec3(cTypes[capIdx][tyIdx].count, 1, 1);
                                spec.verifyIO                   = check32BitFloats;
                                spec.inputTypes[0]              = CAPABILITIES[capIdx].dtype;
 
                                spec.inputs.push_back(BufferSp(new Float16Buffer(float16Data)));
-                               spec.outputs.push_back(BufferSp(new Float32Buffer(cTypes[tyIdx].useConstantIndex ? float32DataConstIdx : float32Data)));
+                               spec.outputs.push_back(BufferSp(new Float32Buffer(cTypes[capIdx][tyIdx].useConstantIndex ? float32DataConstIdx : float32Data)));
                                spec.extensions.push_back("VK_KHR_16bit_storage");
                                spec.requestedVulkanFeatures = get16BitStorageFeatures(CAPABILITIES[capIdx].name);
 
@@ -1522,55 +1538,72 @@ void addCompute16bitStorageUniform16To32Group (tcu::TestCaseGroup* group)
                        bool            useConstantIndex;
                        unsigned        constantIndex;
                        unsigned        count;
+                       unsigned        inputStride;
                };
 
-               const CompositeType     cTypes[]        =
+               const CompositeType     cTypes[2][8]    =
                {
-                       {"scalar_sint",                         true,   sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 4\nOpDecorate %i16arr ArrayStride 2\n",                 false,  0,      numElements},
-                       {"scalar_sint_const_idx_5",     true,   sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 4\nOpDecorate %i16arr ArrayStride 2\n",                 true,   5,      numElements},
-                       {"scalar_sint_const_idx_8",     true,   sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 4\nOpDecorate %i16arr ArrayStride 2\n",                 true,   8,      numElements},
-                       {"scalar_uint",                         false,  uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 4\nOpDecorate %u16arr ArrayStride 2\n",                 false,  0,      numElements},
-                       {"scalar_uint_const_idx_5",     false,  uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 4\nOpDecorate %u16arr ArrayStride 2\n",                 true,   5,      numElements},
-                       {"scalar_uint_const_idx_8",     false,  uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 4\nOpDecorate %u16arr ArrayStride 2\n",                 true,   8,      numElements},
-                       {"vector_sint",                         true,   sintTypes,      "v4i32",        "v4i16",        "OpSConvert",   "OpDecorate %v4i32arr ArrayStride 16\nOpDecorate %v4i16arr ArrayStride 8\n",    false,  0,      numElements / 4},
-                       {"vector_uint",                         false,  uintTypes,      "v4u32",        "v4u16",        "OpUConvert",   "OpDecorate %v4u32arr ArrayStride 16\nOpDecorate %v4u16arr ArrayStride 8\n",            false,  0,      numElements / 4}
+                       {
+                               {"scalar_sint",                         true,   sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 4\nOpDecorate %i16arr ArrayStride 2\n",                 false,  0,      numElements,            1},
+                               {"scalar_sint_const_idx_5",     true,   sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 4\nOpDecorate %i16arr ArrayStride 2\n",                 true,   5,      numElements,            1},
+                               {"scalar_sint_const_idx_8",     true,   sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 4\nOpDecorate %i16arr ArrayStride 2\n",                 true,   8,      numElements,            1},
+                               {"scalar_uint",                         false,  uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 4\nOpDecorate %u16arr ArrayStride 2\n",                 false,  0,      numElements,            1},
+                               {"scalar_uint_const_idx_5",     false,  uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 4\nOpDecorate %u16arr ArrayStride 2\n",                 true,   5,      numElements,            1},
+                               {"scalar_uint_const_idx_8",     false,  uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 4\nOpDecorate %u16arr ArrayStride 2\n",                 true,   8,      numElements,            1},
+                               {"vector_sint",                         true,   sintTypes,      "v4i32",        "v4i16",        "OpSConvert",   "OpDecorate %v4i32arr ArrayStride 16\nOpDecorate %v4i16arr ArrayStride 8\n",    false,  0,      numElements / 4,        4},
+                               {"vector_uint",                         false,  uintTypes,      "v4u32",        "v4u16",        "OpUConvert",   "OpDecorate %v4u32arr ArrayStride 16\nOpDecorate %v4u16arr ArrayStride 8\n",    false,  0,      numElements / 4,        4}
+                       },
+                       {
+                               {"scalar_sint",                         true,   sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 4\nOpDecorate %i16arr ArrayStride 16\n",                false,  0,      numElements,            8},
+                               {"scalar_sint_const_idx_5",     true,   sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 4\nOpDecorate %i16arr ArrayStride 16\n",                true,   5,      numElements,            8},
+                               {"scalar_sint_const_idx_8",     true,   sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 4\nOpDecorate %i16arr ArrayStride 16\n",                true,   8,      numElements,            8},
+                               {"scalar_uint",                         false,  uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 4\nOpDecorate %u16arr ArrayStride 16\n",                false,  0,      numElements,            8},
+                               {"scalar_uint_const_idx_5",     false,  uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 4\nOpDecorate %u16arr ArrayStride 16\n",                true,   5,      numElements,            8},
+                               {"scalar_uint_const_idx_8",     false,  uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 4\nOpDecorate %u16arr ArrayStride 16\n",                true,   8,      numElements,            8},
+                               {"vector_sint",                         true,   sintTypes,      "v4i32",        "v4i16",        "OpSConvert",   "OpDecorate %v4i32arr ArrayStride 16\nOpDecorate %v4i16arr ArrayStride 16\n",   false,  0,      numElements / 4,        8},
+                               {"vector_uint",                         false,  uintTypes,      "v4u32",        "v4u16",        "OpUConvert",   "OpDecorate %v4u32arr ArrayStride 16\nOpDecorate %v4u16arr ArrayStride 16\n",   false,  0,      numElements / 4,        8}
+                       }
                };
 
-               vector<deInt16> inputs                  = getInt16s(rnd, numElements);
-               vector<deInt32> sOutputs;
-               vector<deInt32> uOutputs;
-               const deUint16  signBitMask             = 0x8000;
-               const deUint32  signExtendMask  = 0xffff0000;
-
-               sOutputs.reserve(inputs.size());
-               uOutputs.reserve(inputs.size());
-
-               for (deUint32 numNdx = 0; numNdx < inputs.size(); ++numNdx)
-               {
-                       uOutputs.push_back(static_cast<deUint16>(inputs[numNdx]));
-                       if (inputs[numNdx] & signBitMask)
-                               sOutputs.push_back(static_cast<deInt32>(inputs[numNdx] | signExtendMask));
-                       else
-                               sOutputs.push_back(static_cast<deInt32>(inputs[numNdx]));
-               }
-
                for (deUint32 capIdx = 0; capIdx < DE_LENGTH_OF_ARRAY(CAPABILITIES); ++capIdx)
-                       for (deUint32 tyIdx = 0; tyIdx < DE_LENGTH_OF_ARRAY(cTypes); ++tyIdx)
+                       for (deUint32 tyIdx = 0; tyIdx < DE_LENGTH_OF_ARRAY(cTypes[capIdx]); ++tyIdx)
                        {
                                ComputeShaderSpec       spec;
                                map<string, string>     specs;
-                               string                          testName        = string(CAPABILITIES[capIdx].name) + "_" + cTypes[tyIdx].name;
+                               string                          testName                = string(CAPABILITIES[capIdx].name) + "_" + cTypes[capIdx][tyIdx].name;
+                               const deUint32          inputStride             = cTypes[capIdx][tyIdx].inputStride;
+                               vector<deInt16>         inputs                  = getInt16s(rnd, numElements * inputStride);
+                               vector<deInt32>         sOutputs;
+                               vector<deInt32>         uOutputs;
+                               const deUint16          signBitMask             = 0x8000;
+                               const deUint32          signExtendMask  = 0xffff0000;
+                               const deUint32          count                   = cTypes[capIdx][tyIdx].count;
+                               const deUint32          scalarsPerItem  = numElements / count;
+
+                               sOutputs.reserve(numElements);
+                               uOutputs.reserve(numElements);
+
+                               for (deUint32 numNdx = 0; numNdx < count; ++numNdx)
+                                       for (deUint32 scalarIdx = 0; scalarIdx < scalarsPerItem; ++scalarIdx)
+                                       {
+                                               const deInt16 input = inputs[numNdx * inputStride + scalarIdx];
+
+                                               uOutputs.push_back(static_cast<deUint16>(input));
+                                               if (input & signBitMask)
+                                                       sOutputs.push_back(static_cast<deInt32>(input | signExtendMask));
+                                               else
+                                                       sOutputs.push_back(static_cast<deInt32>(input));
+                                       }
+
                                vector<deInt32>         intDataConstIdx;
 
-                               if (cTypes[tyIdx].useConstantIndex)
+                               if (cTypes[capIdx][tyIdx].useConstantIndex)
                                {
-                                       const deUint32 numInts = numElements / cTypes[tyIdx].count;
-
                                        for (deUint32 numIdx = 0; numIdx < numElements; ++numIdx)
                                        {
-                                               const deInt32 idx = cTypes[tyIdx].constantIndex * numInts + numIdx % numInts;
+                                               const deInt32 idx = cTypes[capIdx][tyIdx].constantIndex * scalarsPerItem + numIdx % scalarsPerItem;
 
-                                               if (cTypes[tyIdx].isSigned)
+                                               if (cTypes[capIdx][tyIdx].isSigned)
                                                        intDataConstIdx.push_back(sOutputs[idx]);
                                                else
                                                        intDataConstIdx.push_back(uOutputs[idx]);
@@ -1579,25 +1612,25 @@ void addCompute16bitStorageUniform16To32Group (tcu::TestCaseGroup* group)
 
                                specs["capability"]             = CAPABILITIES[capIdx].cap;
                                specs["storage"]                = CAPABILITIES[capIdx].decor;
-                               specs["stride"]                 = cTypes[tyIdx].stride;
-                               specs["base32"]                 = cTypes[tyIdx].base32;
-                               specs["base16"]                 = cTypes[tyIdx].base16;
-                               specs["types"]                  = cTypes[tyIdx].types;
-                               specs["convert"]                = cTypes[tyIdx].opcode;
-                               specs["constarrayidx"]  = de::toString(cTypes[tyIdx].constantIndex);
-                               if (cTypes[tyIdx].useConstantIndex)
+                               specs["stride"]                 = cTypes[capIdx][tyIdx].stride;
+                               specs["base32"]                 = cTypes[capIdx][tyIdx].base32;
+                               specs["base16"]                 = cTypes[capIdx][tyIdx].base16;
+                               specs["types"]                  = cTypes[capIdx][tyIdx].types;
+                               specs["convert"]                = cTypes[capIdx][tyIdx].opcode;
+                               specs["constarrayidx"]  = de::toString(cTypes[capIdx][tyIdx].constantIndex);
+                               if (cTypes[capIdx][tyIdx].useConstantIndex)
                                        specs["arrayindex"] = "c_i32_ci";
                                else
                                        specs["arrayindex"] = "x";
 
                                spec.assembly                   = shaderTemplate.specialize(specs);
-                               spec.numWorkGroups              = IVec3(cTypes[tyIdx].count, 1, 1);
+                               spec.numWorkGroups              = IVec3(cTypes[capIdx][tyIdx].count, 1, 1);
                                spec.inputTypes[0]              = CAPABILITIES[capIdx].dtype;
 
                                spec.inputs.push_back(BufferSp(new Int16Buffer(inputs)));
-                               if (cTypes[tyIdx].useConstantIndex)
+                               if (cTypes[capIdx][tyIdx].useConstantIndex)
                                        spec.outputs.push_back(BufferSp(new Int32Buffer(intDataConstIdx)));
-                               else if (cTypes[tyIdx].isSigned)
+                               else if (cTypes[capIdx][tyIdx].isSigned)
                                        spec.outputs.push_back(BufferSp(new Int32Buffer(sOutputs)));
                                else
                                        spec.outputs.push_back(BufferSp(new Int32Buffer(uOutputs)));
@@ -1613,7 +1646,7 @@ void addCompute16bitStorageUniform16To32ChainAccessGroup (tcu::TestCaseGroup* gr
 {
        tcu::TestContext&                               testCtx                 = group->getTestContext();
        de::Random                                              rnd                             (deStringHash(group->getName()));
-       const deUint32                                  structSize              = 24; // In number of 16bit items. Includes padding.
+       const deUint32                                  structSize              = 128; // In number of 16bit items. Includes padding.
        vector<deFloat16>                               inputDataFloat  = getFloat16s(rnd, structSize * 4);
        vector<deInt16>                                 inputDataInt    = getInt16s(rnd, structSize * 4);
        vector<float>                                   outputDataFloat;
@@ -1653,13 +1686,13 @@ void addCompute16bitStorageUniform16To32ChainAccessGroup (tcu::TestCaseGroup* gr
                "                              OpDecorate %Output BufferBlock\n"
                "                              OpDecorate %dataOutput DescriptorSet 0\n"
                "                              OpDecorate %dataOutput Binding 1\n"
-               "                              OpDecorate %scalarArray ArrayStride 2\n"
-               "                              OpDecorate %scalarArray2D ArrayStride 8\n"
+               "                              OpDecorate %scalarArray ArrayStride 16\n"
+               "                              OpDecorate %scalarArray2D ArrayStride 48\n"
                "                              OpMemberDecorate %S 0 Offset 0\n"
-               "                              OpMemberDecorate %S 1 Offset 8\n"
+               "                              OpMemberDecorate %S 1 Offset 48\n"
                "                              ${decoration:opt}\n"
-               "                              OpMemberDecorate %S 2 Offset 40\n"
-               "                              OpDecorate %_arr_S_uint_4 ArrayStride 48\n"
+               "                              OpMemberDecorate %S 2 Offset 240\n"
+               "                              OpDecorate %_arr_S_uint_4 ArrayStride 256\n"
                "                              OpMemberDecorate %Input 0 Offset 0\n"
                "                              OpMemberDecorate %Output 0 Offset 0\n"
                "                              OpDecorate %Input ${storage}\n"
@@ -1724,19 +1757,28 @@ void addCompute16bitStorageUniform16To32ChainAccessGroup (tcu::TestCaseGroup* gr
                const deUint32          signExtendMask          = 0xffff0000;
                // Determine the selected output float for the selected indices.
                const tcu::UVec4        vec                                     = indices[numIdx];
-               // Offsets are in multiples of 16bits.
-               const deUint32          fieldOffsets[3][3]      =
+               // Offsets are in multiples of 16 bits. Floats use a matrix as the
+               // second field, which has different layout rules than a 2D array;
+               // therefore separate offset tables are needed.
+               const deUint32          fieldOffsetsFloat[3][3] =
                {
-                       {0u,    1u,     0u},
-                       {4u,    4u,     1u},
-                       {20u,   1u,     0u}
+                       {0u,    8u,             0u},
+                       {24,    24u,    1u},
+                       {120u,  1u,             0u}
                };
-               const deUint32          offset                          = vec.x() * structSize + fieldOffsets[vec.y()][0] + fieldOffsets[vec.y()][1] * vec.z() + fieldOffsets[vec.y()][2] * vec.w();
-               const bool                      hasSign                         = inputDataInt[offset] & signBitMask;
+               const deUint32          fieldOffsetsInt[3][3]   =
+               {
+                       {0u,    8u,             0u},
+                       {24,    24u,    8u},
+                       {120u,  1u,             0u}
+               };
+               const deUint32          offsetFloat                             = vec.x() * structSize + fieldOffsetsFloat[vec.y()][0] + fieldOffsetsFloat[vec.y()][1] * vec.z() + fieldOffsetsFloat[vec.y()][2] * vec.w();
+               const deUint32          offsetInt                               = vec.x() * structSize + fieldOffsetsInt[vec.y()][0] + fieldOffsetsInt[vec.y()][1] * vec.z() + fieldOffsetsInt[vec.y()][2] * vec.w();
+               const bool                      hasSign                                 = inputDataInt[offsetInt] & signBitMask;
 
-               outputDataFloat.push_back(deFloat16To32(inputDataFloat[offset]));
-               outputDataUInt.push_back((deUint16)inputDataInt[offset]);
-               outputDataSInt.push_back((deInt32)(inputDataInt[offset] | (hasSign ? signExtendMask : 0u)));
+               outputDataFloat.push_back(deFloat16To32(inputDataFloat[offsetFloat]));
+               outputDataUInt.push_back((deUint16)inputDataInt[offsetInt]);
+               outputDataSInt.push_back((deInt32)(inputDataInt[offsetInt] | (hasSign ? signExtendMask : 0u)));
        }
 
        for (deUint32 indicesIdx = 0; indicesIdx < (deUint32)indices.size(); ++indicesIdx)
@@ -1790,7 +1832,7 @@ void addCompute16bitStorageUniform16To32ChainAccessGroup (tcu::TestCaseGroup* gr
                                if (dataTypeIdx == 0)
                                {
                                        spec.verifyIO           = check32BitFloats;
-                                       specs["decoration"]     = "OpMemberDecorate %S 1 ColMajor\nOpMemberDecorate %S 1 MatrixStride 8\n";
+                                       specs["decoration"]     = "OpMemberDecorate %S 1 ColMajor\nOpMemberDecorate %S 1 MatrixStride 48\n";
                                }
 
                                spec.assembly                                                   = shaderTemplate.specialize(specs);
@@ -1823,7 +1865,7 @@ void addCompute16bitStoragePushConstant16To32Group (tcu::TestCaseGroup* group)
 
                "${stride}"
 
-               "OpDecorate %PC16 Block\n"
+               "OpDecorate %PC16 BufferBlock\n"
                "OpMemberDecorate %PC16 0 Offset 0\n"
                "OpMemberDecorate %SSBO32 0 Offset 0\n"
                "OpDecorate %SSBO32 BufferBlock\n"
@@ -2102,7 +2144,6 @@ void addGraphics16BitStorageUniformInt32To16Group (tcu::TestCaseGroup* testGroup
        map<string, string>                                     fragments;
        const deUint32                                          numDataPoints           = 256;
        RGBA                                                            defaultColors[4];
-       GraphicsResources                                       resources;
        vector<string>                                          extensions;
        const StringTemplate                            capabilities            ("OpCapability ${cap}\n");
        // inputs and outputs are declared to be vectors of signed integers.
@@ -2116,8 +2157,6 @@ void addGraphics16BitStorageUniformInt32To16Group (tcu::TestCaseGroup* testGroup
        for (deUint32 numNdx = 0; numNdx < inputs.size(); ++numNdx)
                outputs.push_back(static_cast<deInt16>(0xffff & inputs[numNdx]));
 
-       resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Int32Buffer(inputs))));
-       resources.outputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Int16Buffer(outputs))));
 
        extensions.push_back("VK_KHR_16bit_storage");
        fragments["extension"]  = "OpExtension \"SPV_KHR_16bit_storage\"";
@@ -2154,7 +2193,7 @@ void addGraphics16BitStorageUniformInt32To16Group (tcu::TestCaseGroup* testGroup
                        "   %ssbo16 = OpVariable %up_SSBO16 Uniform\n");
 
        const StringTemplate    scalarDecoration(
-                       "OpDecorate %ra_i32 ArrayStride 4\n"
+                       "OpDecorate %ra_i32 ArrayStride ${arraystride}\n"
                        "OpDecorate %ra_i16 ArrayStride 2\n"
                        "OpMemberDecorate %SSBO32 0 Offset 0\n"
                        "OpMemberDecorate %SSBO16 0 Offset 0\n"
@@ -2262,26 +2301,57 @@ void addGraphics16BitStorageUniformInt32To16Group (tcu::TestCaseGroup* testGroup
 
                        "OpFunctionEnd\n");
 
-       struct Category
+       // Scalar
        {
-               const char*                             name;
-               const StringTemplate&   preMain;
-               const StringTemplate&   decoration;
-               const StringTemplate&   testFunction;
-       };
+               const deUint32  arrayStrides[]          = {4, 16};
+
+               for (deUint32 capIdx = 0; capIdx < DE_LENGTH_OF_ARRAY(CAPABILITIES); ++capIdx)
+                       for (deUint32 factIdx = 0; factIdx < DE_LENGTH_OF_ARRAY(intFacts); ++factIdx)
+                       {
+                               map<string, string>     specs;
+                               string                          name            = string(CAPABILITIES[capIdx].name) + "_scalar_" + intFacts[factIdx].name;
 
-       const Category          categories[]    =
+                               specs["cap"]                                    = CAPABILITIES[capIdx].cap;
+                               specs["indecor"]                                = CAPABILITIES[capIdx].decor;
+                               specs["itype32"]                                = intFacts[factIdx].type32;
+                               specs["v4itype32"]                              = "%v4" + string(intFacts[factIdx].type32).substr(1);
+                               specs["itype16"]                                = intFacts[factIdx].type16;
+                               specs["signed"]                                 = intFacts[factIdx].isSigned;
+                               specs["convert"]                                = intFacts[factIdx].opcode;
+                               specs["arraystride"]                    = de::toString(arrayStrides[capIdx]);
+
+                               fragments["pre_main"]                   = scalarPreMain.specialize(specs);
+                               fragments["testfun"]                    = scalarTestFunc.specialize(specs);
+                               fragments["capability"]                 = capabilities.specialize(specs);
+                               fragments["decoration"]                 = scalarDecoration.specialize(specs);
+
+                               vector<deInt32>         inputsPadded;
+                               for (size_t dataIdx = 0; dataIdx < inputs.size(); ++dataIdx)
+                               {
+                                       inputsPadded.push_back(inputs[dataIdx]);
+                                       for (deUint32 padIdx = 0; padIdx < arrayStrides[capIdx] / 4 - 1; ++padIdx)
+                                               inputsPadded.push_back(0);
+                               }
+                               GraphicsResources       resources;
+                               resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Int32Buffer(inputsPadded))));
+                               resources.outputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Int16Buffer(outputs))));
+
+                               resources.inputs.back().first   = CAPABILITIES[capIdx].dtype;
+
+                               createTestsForAllStages(name, defaultColors, defaultColors, fragments, resources, extensions, testGroup, get16BitStorageFeatures(CAPABILITIES[capIdx].name));
+                       }
+       }
+       // Vector
        {
-               {"scalar",      scalarPreMain,  scalarDecoration,       scalarTestFunc},
-               {"vector",      vecPreMain,             vecDecoration,          vecTestFunc},
-       };
+               GraphicsResources       resources;
+               resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Int32Buffer(inputs))));
+               resources.outputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Int16Buffer(outputs))));
 
-       for (deUint32 catIdx = 0; catIdx < DE_LENGTH_OF_ARRAY(categories); ++catIdx)
                for (deUint32 capIdx = 0; capIdx < DE_LENGTH_OF_ARRAY(CAPABILITIES); ++capIdx)
                        for (deUint32 factIdx = 0; factIdx < DE_LENGTH_OF_ARRAY(intFacts); ++factIdx)
                        {
                                map<string, string>     specs;
-                               string                          name            = string(CAPABILITIES[capIdx].name) + "_" + categories[catIdx].name + "_" + intFacts[factIdx].name;
+                               string                          name            = string(CAPABILITIES[capIdx].name) + "_vector_" + intFacts[factIdx].name;
 
                                specs["cap"]                                    = CAPABILITIES[capIdx].cap;
                                specs["indecor"]                                = CAPABILITIES[capIdx].decor;
@@ -2291,15 +2361,16 @@ void addGraphics16BitStorageUniformInt32To16Group (tcu::TestCaseGroup* testGroup
                                specs["signed"]                                 = intFacts[factIdx].isSigned;
                                specs["convert"]                                = intFacts[factIdx].opcode;
 
-                               fragments["pre_main"]                   = categories[catIdx].preMain.specialize(specs);
-                               fragments["testfun"]                    = categories[catIdx].testFunction.specialize(specs);
+                               fragments["pre_main"]                   = vecPreMain.specialize(specs);
+                               fragments["testfun"]                    = vecTestFunc.specialize(specs);
                                fragments["capability"]                 = capabilities.specialize(specs);
-                               fragments["decoration"]                 = categories[catIdx].decoration.specialize(specs);
+                               fragments["decoration"]                 = vecDecoration.specialize(specs);
 
                                resources.inputs.back().first   = CAPABILITIES[capIdx].dtype;
 
                                createTestsForAllStages(name, defaultColors, defaultColors, fragments, resources, extensions, testGroup, get16BitStorageFeatures(CAPABILITIES[capIdx].name));
                        }
+       }
 }
 
 void addCompute16bitStorageUniform16To16Group (tcu::TestCaseGroup* group)
@@ -2493,36 +2564,44 @@ void addCompute16bitStorageUniform32To16Group (tcu::TestCaseGroup* group)
                        const char*     base16;
                        const char*     stride;
                        unsigned        count;
+                       unsigned        inputStride;
                };
 
-               const CompositeType     cTypes[]        =
+               const CompositeType     cTypes[2][3]    =
                {
-                       {"scalar",      "f32",          "f16",          "OpDecorate %f32arr ArrayStride 4\nOpDecorate %f16arr ArrayStride 2\n",                         numElements},
-                       {"vector",      "v4f32",        "v4f16",        "OpDecorate %v4f32arr ArrayStride 16\nOpDecorate %v4f16arr ArrayStride 8\n",            numElements / 4},
-                       {"matrix",      "v4f32",        "v4f16",        "OpDecorate %m2v4f32arr ArrayStride 32\nOpDecorate %m2v4f16arr ArrayStride 16\n",       numElements / 8},
+                       { // BufferBlock
+                               {"scalar",      "f32",          "f16",          "OpDecorate %f32arr ArrayStride 4\nOpDecorate %f16arr ArrayStride 2\n",                         numElements,            1},
+                               {"vector",      "v4f32",        "v4f16",        "OpDecorate %v4f32arr ArrayStride 16\nOpDecorate %v4f16arr ArrayStride 8\n",            numElements / 4,        1},
+                               {"matrix",      "v4f32",        "v4f16",        "OpDecorate %m2v4f32arr ArrayStride 32\nOpDecorate %m2v4f16arr ArrayStride 16\n",       numElements / 8,        1}
+                       },
+                       { // Block
+                               {"scalar",      "f32",          "f16",          "OpDecorate %f32arr ArrayStride 16\nOpDecorate %f16arr ArrayStride 2\n",                        numElements,            4},
+                               {"vector",      "v4f32",        "v4f16",        "OpDecorate %v4f32arr ArrayStride 16\nOpDecorate %v4f16arr ArrayStride 8\n",            numElements / 4,        1},
+                               {"matrix",      "v4f32",        "v4f16",        "OpDecorate %m2v4f32arr ArrayStride 32\nOpDecorate %m2v4f16arr ArrayStride 16\n",       numElements / 8,        1}
+                       }
                };
 
-               vector<float>           float32Data                     = getFloat32s(rnd, numElements);
                vector<deFloat16>       float16DummyData        (numElements, 0);
 
                for (deUint32 capIdx = 0; capIdx < DE_LENGTH_OF_ARRAY(CAPABILITIES); ++capIdx)
-                       for (deUint32 tyIdx = 0; tyIdx < DE_LENGTH_OF_ARRAY(cTypes); ++tyIdx)
+                       for (deUint32 tyIdx = 0; tyIdx < DE_LENGTH_OF_ARRAY(cTypes[capIdx]); ++tyIdx)
                                for (deUint32 rndModeIdx = 0; rndModeIdx < DE_LENGTH_OF_ARRAY(rndModes); ++rndModeIdx)
                                {
                                        ComputeShaderSpec               spec;
                                        map<string, string>             specs;
-                                       string                                  testName        = string(CAPABILITIES[capIdx].name) + "_" + cTypes[tyIdx].name + "_float_" + rndModes[rndModeIdx].name;
+                                       string                                  testName                        = string(CAPABILITIES[capIdx].name) + "_" + cTypes[capIdx][tyIdx].name + "_float_" + rndModes[rndModeIdx].name;
+                                       vector<float>                   float32Data                     = getFloat32s(rnd, numElements * cTypes[capIdx][tyIdx].inputStride);
 
                                        specs["capability"]             = CAPABILITIES[capIdx].cap;
                                        specs["storage"]                = CAPABILITIES[capIdx].decor;
-                                       specs["stride"]                 = cTypes[tyIdx].stride;
-                                       specs["base32"]                 = cTypes[tyIdx].base32;
-                                       specs["base16"]                 = cTypes[tyIdx].base16;
+                                       specs["stride"]                 = cTypes[capIdx][tyIdx].stride;
+                                       specs["base32"]                 = cTypes[capIdx][tyIdx].base32;
+                                       specs["base16"]                 = cTypes[capIdx][tyIdx].base16;
                                        specs["rounding"]               = rndModes[rndModeIdx].decor;
                                        specs["types"]                  = floatTypes;
                                        specs["convert"]                = "OpFConvert";
 
-                                       if (strcmp(cTypes[tyIdx].name, "matrix") == 0)
+                                       if (strcmp(cTypes[capIdx][tyIdx].name, "matrix") == 0)
                                        {
                                                if (strcmp(rndModes[rndModeIdx].name, "rtz") == 0)
                                                        specs["rounding"] += "\nOpDecorate %val16_1  FPRoundingMode RTZ\n";
@@ -2550,7 +2629,7 @@ void addCompute16bitStorageUniform32To16Group (tcu::TestCaseGroup* group)
                                        }
 
                                        spec.assembly                   = shaderTemplate.specialize(specs);
-                                       spec.numWorkGroups              = IVec3(cTypes[tyIdx].count, 1, 1);
+                                       spec.numWorkGroups              = IVec3(cTypes[capIdx][tyIdx].count, 1, 1);
                                        spec.verifyIO                   = rndModes[rndModeIdx].func;
                                        spec.inputTypes[0]              = CAPABILITIES[capIdx].dtype;
 
@@ -2599,40 +2678,53 @@ void addCompute16bitStorageUniform32To16Group (tcu::TestCaseGroup* group)
                        const char* opcode;
                        const char*     stride;
                        unsigned        count;
+                       unsigned        inputStride;
                };
 
-               const CompositeType     cTypes[]        =
+               const CompositeType     cTypes[2][4]    =
                {
-                       {"scalar_sint", sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 4\nOpDecorate %i16arr ArrayStride 2\n",         numElements},
-                       {"scalar_uint", uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 4\nOpDecorate %u16arr ArrayStride 2\n",         numElements},
-                       {"vector_sint", sintTypes,      "v2i32",        "v2i16",        "OpSConvert",   "OpDecorate %v2i32arr ArrayStride 8\nOpDecorate %v2i16arr ArrayStride 4\n",     numElements / 2},
-                       {"vector_uint", uintTypes,      "v2u32",        "v2u16",        "OpUConvert",   "OpDecorate %v2u32arr ArrayStride 8\nOpDecorate %v2u16arr ArrayStride 4\n",     numElements / 2},
+                       {
+                               {"scalar_sint", sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 4\nOpDecorate %i16arr ArrayStride 2\n",         numElements,                    1},
+                               {"scalar_uint", uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 4\nOpDecorate %u16arr ArrayStride 2\n",         numElements,                    1},
+                               {"vector_sint", sintTypes,      "v2i32",        "v2i16",        "OpSConvert",   "OpDecorate %v2i32arr ArrayStride 8\nOpDecorate %v2i16arr ArrayStride 4\n",     numElements / 2,                2},
+                               {"vector_uint", uintTypes,      "v2u32",        "v2u16",        "OpUConvert",   "OpDecorate %v2u32arr ArrayStride 8\nOpDecorate %v2u16arr ArrayStride 4\n",     numElements / 2,                2}
+                       },
+                       {
+                               {"scalar_sint", sintTypes,      "i32",          "i16",          "OpSConvert",   "OpDecorate %i32arr ArrayStride 16\nOpDecorate %i16arr ArrayStride 2\n",                numElements,            4},
+                               {"scalar_uint", uintTypes,      "u32",          "u16",          "OpUConvert",   "OpDecorate %u32arr ArrayStride 16\nOpDecorate %u16arr ArrayStride 2\n",                numElements,            4},
+                               {"vector_sint", sintTypes,      "v2i32",        "v2i16",        "OpSConvert",   "OpDecorate %v2i32arr ArrayStride 16\nOpDecorate %v2i16arr ArrayStride 4\n",    numElements / 2,        4},
+                               {"vector_uint", uintTypes,      "v2u32",        "v2u16",        "OpUConvert",   "OpDecorate %v2u32arr ArrayStride 16\nOpDecorate %v2u16arr ArrayStride 4\n",    numElements / 2,        4}
+                       }
                };
 
-               vector<deInt32> inputs                  = getInt32s(rnd, numElements);
-               vector<deInt16> outputs;
-
-               outputs.reserve(inputs.size());
-               for (deUint32 numNdx = 0; numNdx < inputs.size(); ++numNdx)
-                       outputs.push_back(static_cast<deInt16>(0xffff & inputs[numNdx]));
-
                for (deUint32 capIdx = 0; capIdx < DE_LENGTH_OF_ARRAY(CAPABILITIES); ++capIdx)
-                       for (deUint32 tyIdx = 0; tyIdx < DE_LENGTH_OF_ARRAY(cTypes); ++tyIdx)
+                       for (deUint32 tyIdx = 0; tyIdx < DE_LENGTH_OF_ARRAY(cTypes[capIdx]); ++tyIdx)
                        {
                                ComputeShaderSpec               spec;
                                map<string, string>             specs;
-                               string                                  testName        = string(CAPABILITIES[capIdx].name) + "_" + cTypes[tyIdx].name;
+                               string                                  testName                = string(CAPABILITIES[capIdx].name) + "_" + cTypes[capIdx][tyIdx].name;
+                               const deUint32                  inputStride             = cTypes[capIdx][tyIdx].inputStride;
+                               const deUint32                  count                   = cTypes[capIdx][tyIdx].count;
+                               const deUint32                  scalarsPerItem  = numElements / count;
+
+                               vector<deInt32> inputs                                  = getInt32s(rnd, numElements * inputStride);
+                               vector<deInt16> outputs;
+
+                               outputs.reserve(numElements);
+                               for (deUint32 numNdx = 0; numNdx < count; ++numNdx)
+                                       for (deUint32 scalarIdx = 0; scalarIdx < scalarsPerItem; scalarIdx++)
+                                               outputs.push_back(static_cast<deInt16>(0xffff & inputs[numNdx * inputStride + scalarIdx]));
 
                                specs["capability"]             = CAPABILITIES[capIdx].cap;
                                specs["storage"]                = CAPABILITIES[capIdx].decor;
-                               specs["stride"]                 = cTypes[tyIdx].stride;
-                               specs["base32"]                 = cTypes[tyIdx].base32;
-                               specs["base16"]                 = cTypes[tyIdx].base16;
-                               specs["types"]                  = cTypes[tyIdx].types;
-                               specs["convert"]                = cTypes[tyIdx].opcode;
+                               specs["stride"]                 = cTypes[capIdx][tyIdx].stride;
+                               specs["base32"]                 = cTypes[capIdx][tyIdx].base32;
+                               specs["base16"]                 = cTypes[capIdx][tyIdx].base16;
+                               specs["types"]                  = cTypes[capIdx][tyIdx].types;
+                               specs["convert"]                = cTypes[capIdx][tyIdx].opcode;
 
                                spec.assembly                   = shaderTemplate.specialize(specs);
-                               spec.numWorkGroups              = IVec3(cTypes[tyIdx].count, 1, 1);
+                               spec.numWorkGroups              = IVec3(cTypes[capIdx][tyIdx].count, 1, 1);
                                spec.inputTypes[0]              = CAPABILITIES[capIdx].dtype;
 
                                spec.inputs.push_back(BufferSp(new Int32Buffer(inputs)));
@@ -3300,17 +3392,21 @@ void addGraphics16BitStorageUniformFloat32To16Group (tcu::TestCaseGroup* testGro
 {
        de::Random                                                      rnd                                     (deStringHash(testGroup->getName()));
        map<string, string>                                     fragments;
-       GraphicsResources                                       resources;
        vector<string>                                          extensions;
        const deUint32                                          numDataPoints           = 256;
        RGBA                                                            defaultColors[4];
-       vector<float>                                           float32Data                     = getFloat32s(rnd, numDataPoints);
+       const vector<float>                                     float32Data                     = getFloat32s(rnd, numDataPoints);
+       vector<float>                                           float32DataPadded;
        vector<deFloat16>                                       float16DummyData        (numDataPoints, 0);
        const StringTemplate                            capabilities            ("OpCapability ${cap}\n");
 
-       resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float32Buffer(float32Data))));
-       // We use a custom verifyIO to check the result via computing directly from inputs; the contents in outputs do not matter.
-       resources.outputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float16Buffer(float16DummyData))));
+       for (size_t dataIdx = 0; dataIdx < float32Data.size(); ++dataIdx)
+       {
+               float32DataPadded.push_back(float32Data[dataIdx]);
+               float32DataPadded.push_back(0.0f);
+               float32DataPadded.push_back(0.0f);
+               float32DataPadded.push_back(0.0f);
+       }
 
        extensions.push_back("VK_KHR_16bit_storage");
        fragments["extension"]  = "OpExtension \"SPV_KHR_16bit_storage\"";
@@ -3340,7 +3436,7 @@ void addGraphics16BitStorageUniformFloat32To16Group (tcu::TestCaseGroup* testGro
                        "   %ssbo16 = OpVariable %up_SSBO16 Uniform\n";
 
                const StringTemplate decoration         (
-                       "OpDecorate %ra_f32 ArrayStride 4\n"
+                       "OpDecorate %ra_f32 ArrayStride ${arraystride}\n"
                        "OpDecorate %ra_f16 ArrayStride 2\n"
                        "OpMemberDecorate %SSBO32 0 Offset 0\n"
                        "OpMemberDecorate %SSBO16 0 Offset 0\n"
@@ -3387,21 +3483,29 @@ void addGraphics16BitStorageUniformFloat32To16Group (tcu::TestCaseGroup* testGro
 
                        "OpFunctionEnd\n";
 
-               const RndMode   rndModes[] =
+               const RndMode   rndModes[]                      =
                {
                        {"rtz",                                         "OpDecorate %val16  FPRoundingMode RTZ",        graphicsCheck16BitFloats<ROUNDINGMODE_RTZ>},
                        {"rte",                                         "OpDecorate %val16  FPRoundingMode RTE",        graphicsCheck16BitFloats<ROUNDINGMODE_RTE>},
                        {"unspecified_rnd_mode",        "",                                                                                     graphicsCheck16BitFloats<RoundingModeFlags(ROUNDINGMODE_RTE | ROUNDINGMODE_RTZ)>},
                };
 
+               const deUint32  arrayStrides[]          = {4, 16};
+
                for (deUint32 capIdx = 0; capIdx < DE_LENGTH_OF_ARRAY(CAPABILITIES); ++capIdx)
                        for (deUint32 rndModeIdx = 0; rndModeIdx < DE_LENGTH_OF_ARRAY(rndModes); ++rndModeIdx)
                        {
                                map<string, string>     specs;
                                string                          testName        = string(CAPABILITIES[capIdx].name) + "_scalar_float_" + rndModes[rndModeIdx].name;
 
+                               GraphicsResources       resources;
+                               resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float32Buffer(arrayStrides[capIdx] == 4 ? float32Data : float32DataPadded))));
+                               // We use a custom verifyIO to check the result via computing directly from inputs; the contents in outputs do not matter.
+                               resources.outputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float16Buffer(float16DummyData))));
+
                                specs["cap"]                                    = CAPABILITIES[capIdx].cap;
                                specs["indecor"]                                = CAPABILITIES[capIdx].decor;
+                               specs["arraystride"]                    = de::toString(arrayStrides[capIdx]);
                                specs["rounddecor"]                             = rndModes[rndModeIdx].decor;
 
                                fragments["capability"]                 = capabilities.specialize(specs);
@@ -3410,11 +3514,16 @@ void addGraphics16BitStorageUniformFloat32To16Group (tcu::TestCaseGroup* testGro
                                resources.inputs.back().first   = CAPABILITIES[capIdx].dtype;
                                resources.verifyIO                              = rndModes[rndModeIdx].f;
 
-
                                createTestsForAllStages(testName, defaultColors, defaultColors, fragments, resources, extensions, testGroup, get16BitStorageFeatures(CAPABILITIES[capIdx].name));
                        }
        }
 
+       // Non-scalar cases can use the same resources.
+       GraphicsResources       resources;
+       resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float32Buffer(float32Data))));
+       // We use a custom verifyIO to check the result via computing directly from inputs; the contents in outputs do not matter.
+       resources.outputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float16Buffer(float16DummyData))));
+
        {  // vector cases
                fragments["pre_main"]                           =
                        "      %f16 = OpTypeFloat 16\n"
@@ -3737,7 +3846,7 @@ void addGraphics16BitStorageInputOutputFloat32To16Group (tcu::TestCaseGroup* tes
 
                        for (deUint32 caseNdx = 0; caseNdx < numDataPoints / numPerCase; ++caseNdx)
                        {
-                               string                  testName        = string(cases[caseIdx].name) + numberToString(caseNdx) + "_" + rndModes[rndModeIdx].name;
+                               string          testName        = string(cases[caseIdx].name) + numberToString(caseNdx) + "_" + rndModes[rndModeIdx].name;
 
                                for (deUint32 numNdx = 0; numNdx < numPerCase; ++numNdx)
                                {
@@ -4457,7 +4566,7 @@ void addGraphics16BitStoragePushConstantFloat16To32Group (tcu::TestCaseGroup* te
                        "OpDecorate %a64f32 ArrayStride 4\n"
                        "OpDecorate %SSBO32 BufferBlock\n"
                        "OpMemberDecorate %SSBO32 0 Offset 0\n"
-                       "OpDecorate %PC16 BufferBlock\n"
+                       "OpDecorate %PC16 Block\n"
                        "OpMemberDecorate %PC16 0 Offset 0\n"
                        "OpDecorate %ssbo32 DescriptorSet 0\n"
                        "OpDecorate %ssbo32 Binding 0\n";
@@ -4526,7 +4635,7 @@ void addGraphics16BitStoragePushConstantFloat16To32Group (tcu::TestCaseGroup* te
                        "OpDecorate %a16v4f32 ArrayStride 16\n"
                        "OpDecorate %SSBO32 BufferBlock\n"
                        "OpMemberDecorate %SSBO32 0 Offset 0\n"
-                       "OpDecorate %PC16 BufferBlock\n"
+                       "OpDecorate %PC16 Block\n"
                        "OpMemberDecorate %PC16 0 Offset 0\n"
                        "OpDecorate %ssbo32 DescriptorSet 0\n"
                        "OpDecorate %ssbo32 Binding 0\n";
@@ -4599,7 +4708,7 @@ void addGraphics16BitStoragePushConstantFloat16To32Group (tcu::TestCaseGroup* te
                        "OpMemberDecorate %SSBO32 0 Offset 0\n"
                        "OpMemberDecorate %SSBO32 0 ColMajor\n"
                        "OpMemberDecorate %SSBO32 0 MatrixStride 16\n"
-                       "OpDecorate %PC16 BufferBlock\n"
+                       "OpDecorate %PC16 Block\n"
                        "OpMemberDecorate %PC16 0 Offset 0\n"
                        "OpMemberDecorate %PC16 0 ColMajor\n"
                        "OpMemberDecorate %PC16 0 MatrixStride 8\n"
@@ -4766,7 +4875,7 @@ void addGraphics16BitStoragePushConstantInt16To32Group (tcu::TestCaseGroup* test
                        "OpDecorate %a${count}${type32} ArrayStride 4\n"
                        "OpDecorate %SSBO32 BufferBlock\n"
                        "OpMemberDecorate %SSBO32 0 Offset 0\n"
-                       "OpDecorate %PC16 BufferBlock\n"
+                       "OpDecorate %PC16 Block\n"
                        "OpMemberDecorate %PC16 0 Offset 0\n"
                        "OpDecorate %ssbo32 DescriptorSet 0\n"
                        "OpDecorate %ssbo32 Binding 0\n");
@@ -4881,7 +4990,7 @@ void addGraphics16BitStoragePushConstantInt16To32Group (tcu::TestCaseGroup* test
                        "OpDecorate %a${count}${type32} ArrayStride 8\n"
                        "OpDecorate %SSBO32 BufferBlock\n"
                        "OpMemberDecorate %SSBO32 0 Offset 0\n"
-                       "OpDecorate %PC16 BufferBlock\n"
+                       "OpDecorate %PC16 Block\n"
                        "OpMemberDecorate %PC16 0 Offset 0\n"
                        "OpDecorate %ssbo32 DescriptorSet 0\n"
                        "OpDecorate %ssbo32 Binding 0\n");
@@ -4986,7 +5095,6 @@ void addGraphics16BitStorageUniformInt16To32Group (tcu::TestCaseGroup* testGroup
        vector<deInt16>                                         inputs                          = getInt16s(rnd, numDataPoints);
        vector<deInt32>                                         sOutputs;
        vector<deInt32>                                         uOutputs;
-       GraphicsResources                                       resources;
        vector<string>                                          extensions;
        const deUint16                                          signBitMask                     = 0x8000;
        const deUint32                                          signExtendMask          = 0xffff0000;
@@ -5004,8 +5112,6 @@ void addGraphics16BitStorageUniformInt16To32Group (tcu::TestCaseGroup* testGroup
                        sOutputs.push_back(static_cast<deInt32>(inputs[numNdx]));
        }
 
-       resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Int16Buffer(inputs))));
-
        extensions.push_back("VK_KHR_16bit_storage");
        fragments["extension"]  = "OpExtension \"SPV_KHR_16bit_storage\"";
 
@@ -5057,7 +5163,7 @@ void addGraphics16BitStorageUniformInt16To32Group (tcu::TestCaseGroup* testGroup
 
        const StringTemplate scalarDecoration           (
                        "OpDecorate %ra_i32 ArrayStride 4\n"
-                       "OpDecorate %ra_i16 ArrayStride 2\n"
+                       "OpDecorate %ra_i16 ArrayStride ${arraystride}\n"
                        "OpMemberDecorate %SSBO32 0 Offset 0\n"
                        "OpMemberDecorate %SSBO16 0 Offset 0\n"
                        "OpDecorate %SSBO32 BufferBlock\n"
@@ -5119,7 +5225,7 @@ void addGraphics16BitStorageUniformInt16To32Group (tcu::TestCaseGroup* testGroup
 
        const StringTemplate vecDecoration              (
                        "OpDecorate %ra_v2i32 ArrayStride 8\n"
-                       "OpDecorate %ra_v2i16 ArrayStride 4\n"
+                       "OpDecorate %ra_v2i16 ArrayStride ${arraystride}\n"
                        "OpMemberDecorate %SSBO32 0 Offset 0\n"
                        "OpMemberDecorate %SSBO16 0 Offset 0\n"
                        "OpDecorate %SSBO32 BufferBlock\n"
@@ -5129,7 +5235,7 @@ void addGraphics16BitStorageUniformInt16To32Group (tcu::TestCaseGroup* testGroup
                        "OpDecorate %ssbo32 Binding 1\n"
                        "OpDecorate %ssbo16 Binding 0\n");
 
-       const StringTemplate vecTestFunc        (
+       const StringTemplate vecTestFunc                (
                        "%test_code = OpFunction %v4f32 None %v4f32_v4f32_function\n"
                        "    %param = OpFunctionParameter %v4f32\n"
 
@@ -5172,12 +5278,14 @@ void addGraphics16BitStorageUniformInt16To32Group (tcu::TestCaseGroup* testGroup
                const deUint32                  numElements;
        };
 
-       const Category          categories[]    =
+       const Category          categories[]            =
        {
                {"scalar",      scalarPreMain,  scalarDecoration,       scalarTestFunc, 1},
                {"vector",      vecPreMain,             vecDecoration,          vecTestFunc,    2},
        };
 
+       const deUint32          minArrayStride[]        = {2, 16};
+
        for (deUint32 catIdx = 0; catIdx < DE_LENGTH_OF_ARRAY(categories); ++catIdx)
                for (deUint32 capIdx = 0; capIdx < DE_LENGTH_OF_ARRAY(CAPABILITIES); ++capIdx)
                        for (deUint32 factIdx = 0; factIdx < DE_LENGTH_OF_ARRAY(intFacts); ++factIdx)
@@ -5187,9 +5295,12 @@ void addGraphics16BitStorageUniformInt16To32Group (tcu::TestCaseGroup* testGroup
                                        deUint32                        constIdx                = constantIndices[constIndexIdx].constantIndex;
                                        map<string, string>     specs;
                                        string                          name                    = string(CAPABILITIES[capIdx].name) + "_" + categories[catIdx].name + "_" + intFacts[factIdx].name;
+                                       const deUint32          numElements             = categories[catIdx].numElements;
+                                       const deUint32          arrayStride             = de::max(numElements * 2, minArrayStride[capIdx]);
 
                                        specs["cap"]                                            = CAPABILITIES[capIdx].cap;
                                        specs["indecor"]                                        = CAPABILITIES[capIdx].decor;
+                                       specs["arraystride"]                            = de::toString(arrayStride);
                                        specs["itype32"]                                        = intFacts[factIdx].type32;
                                        specs["v2itype32"]                                      = "%v2" + string(intFacts[factIdx].type32).substr(1);
                                        specs["v3itype32"]                                      = "%v3" + string(intFacts[factIdx].type32).substr(1);
@@ -5210,10 +5321,21 @@ void addGraphics16BitStorageUniformInt16To32Group (tcu::TestCaseGroup* testGroup
                                        fragments["capability"]                         = capabilities.specialize(specs);
                                        fragments["decoration"]                         = categories[catIdx].decoration.specialize(specs);
 
+                                       GraphicsResources       resources;
+                                       vector<deInt16>         inputsPadded;
+                                       for (size_t dataIdx = 0; dataIdx < inputs.size() / numElements; ++dataIdx)
+                                       {
+                                               for (deUint32 elementIdx = 0; elementIdx < numElements; ++elementIdx)
+                                                       inputsPadded.push_back(inputs[dataIdx * numElements + elementIdx]);
+                                               for (deUint32 padIdx = 0; padIdx < arrayStride / 2 - numElements; ++padIdx)
+                                                       inputsPadded.push_back(0);
+                                       }
+
+                                       resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Int16Buffer(inputsPadded))));
+
                                        vector<deInt32>         constIdxOutputs;
                                        if (useConstIdx)
                                        {
-                                               const deUint32 numElements = categories[catIdx].numElements;
                                                name += string("_const_idx_") + de::toString(constIdx);
                                                for (deUint32 i = 0; i < numDataPoints; i++)
                                                {
@@ -5282,7 +5404,7 @@ void addGraphics16BitStorageUniformFloat16To32Group (tcu::TestCaseGroup* testGro
 
                const StringTemplate decoration         (
                        "OpDecorate %ra_f32 ArrayStride 4\n"
-                       "OpDecorate %ra_f16 ArrayStride 2\n"
+                       "OpDecorate %ra_f16 ArrayStride ${arraystride}\n"
                        "OpMemberDecorate %SSBO32 0 Offset 0\n"
                        "OpMemberDecorate %SSBO16 0 Offset 0\n"
                        "OpDecorate %SSBO32 BufferBlock\n"
@@ -5328,6 +5450,8 @@ void addGraphics16BitStorageUniformFloat16To32Group (tcu::TestCaseGroup* testGro
 
                        "OpFunctionEnd\n");
 
+               const deUint32  arrayStrides[]          = {2, 16};
+
                for (deUint32 constIndexIdx = 0; constIndexIdx < DE_LENGTH_OF_ARRAY(constantIndices); ++constIndexIdx)
                {
                        for (deUint32 capIdx = 0; capIdx < DE_LENGTH_OF_ARRAY(CAPABILITIES); ++capIdx)
@@ -5340,6 +5464,7 @@ void addGraphics16BitStorageUniformFloat16To32Group (tcu::TestCaseGroup* testGro
 
                                specs["cap"]                                    = CAPABILITIES[capIdx].cap;
                                specs["indecor"]                                = CAPABILITIES[capIdx].decor;
+                               specs["arraystride"]                    = de::toString(arrayStrides[capIdx]);
                                specs["constarrayidx"]                  = de::toString(constIdx);
                                if (useConstIdx)
                                        specs["arrayindex"] = "c_i32_ci";
@@ -5351,12 +5476,20 @@ void addGraphics16BitStorageUniformFloat16To32Group (tcu::TestCaseGroup* testGro
                                fragments["pre_main"]                   = preMain.specialize(specs);
                                fragments["testfun"]                    = testFun.specialize(specs);
 
+                               vector<deFloat16>       inputData;
+                               for (size_t dataIdx = 0; dataIdx < float16Data.size(); ++dataIdx)
+                               {
+                                       inputData.push_back(float16Data[dataIdx]);
+                                       for (deUint32 padIdx = 0; padIdx < arrayStrides[capIdx] / 2 - 1; ++padIdx)
+                                               inputData.push_back(deFloat16(0.0f));
+                               }
+
                                vector<float>           float32Data;
                                float32Data.reserve(numDataPoints);
                                for (deUint32 numIdx = 0; numIdx < numDataPoints; ++numIdx)
                                        float32Data.push_back(deFloat16To32(float16Data[useConstIdx ? constIdx : numIdx]));
 
-                               resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float16Buffer(float16Data))));
+                               resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float16Buffer(inputData))));
                                resources.outputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float32Buffer(float32Data))));
                                resources.verifyIO = check32BitFloats;
                                resources.inputs.back().first   = CAPABILITIES[capIdx].dtype;
@@ -5388,7 +5521,7 @@ void addGraphics16BitStorageUniformFloat16To32Group (tcu::TestCaseGroup* testGro
 
                const StringTemplate decoration         (
                        "OpDecorate %ra_v2f32 ArrayStride 8\n"
-                       "OpDecorate %ra_v2f16 ArrayStride 4\n"
+                       "OpDecorate %ra_v2f16 ArrayStride ${arraystride}\n"
                        "OpMemberDecorate %SSBO32 0 Offset 0\n"
                        "OpMemberDecorate %SSBO16 0 Offset 0\n"
                        "OpDecorate %SSBO32 BufferBlock\n"
@@ -5434,6 +5567,8 @@ void addGraphics16BitStorageUniformFloat16To32Group (tcu::TestCaseGroup* testGro
 
                        "OpFunctionEnd\n");
 
+               const deUint32  arrayStrides[]          = {4, 16};
+
                for (deUint32 constIndexIdx = 0; constIndexIdx < DE_LENGTH_OF_ARRAY(constantIndices); ++constIndexIdx)
                {
                        for (deUint32 capIdx = 0; capIdx < DE_LENGTH_OF_ARRAY(CAPABILITIES); ++capIdx)
@@ -5446,6 +5581,7 @@ void addGraphics16BitStorageUniformFloat16To32Group (tcu::TestCaseGroup* testGro
 
                                specs["cap"]                                    = CAPABILITIES[capIdx].cap;
                                specs["indecor"]                                = CAPABILITIES[capIdx].decor;
+                               specs["arraystride"]                    = de::toString(arrayStrides[capIdx]);
                                specs["constarrayidx"]                  = de::toString(constIdx);
                                if (useConstIdx)
                                        specs["arrayindex"] = "c_i32_ci";
@@ -5457,12 +5593,21 @@ void addGraphics16BitStorageUniformFloat16To32Group (tcu::TestCaseGroup* testGro
                                fragments["pre_main"]                   = preMain.specialize(specs);
                                fragments["testfun"]                    = testFun.specialize(specs);
 
+                               vector<deFloat16>       inputData;
+                               for (size_t dataIdx = 0; dataIdx < float16Data.size() / 2; ++dataIdx)
+                               {
+                                       inputData.push_back(float16Data[dataIdx * 2]);
+                                       inputData.push_back(float16Data[dataIdx * 2 + 1]);
+                                       for (deUint32 padIdx = 0; padIdx < arrayStrides[capIdx] / 2 - 2; ++padIdx)
+                                               inputData.push_back(deFloat16(0.0f));
+                               }
+
                                vector<float>           float32Data;
                                float32Data.reserve(numDataPoints);
                                for (deUint32 numIdx = 0; numIdx < numDataPoints; ++numIdx)
                                        float32Data.push_back(deFloat16To32(float16Data[constantIndices[constIndexIdx].useConstantIndex ? (constantIndices[constIndexIdx].constantIndex * 2 + numIdx % 2) : numIdx]));
 
-                               resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float16Buffer(float16Data))));
+                               resources.inputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float16Buffer(inputData))));
                                resources.outputs.push_back(std::make_pair(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, BufferSp(new Float32Buffer(float32Data))));
                                resources.verifyIO = check32BitFloats;
                                resources.inputs.back().first   = CAPABILITIES[capIdx].dtype;