Vulkan: VertexBuffer

A vertexBuffer is simply a buffer that holds vertex data: all of the vertex information of our primitives ends up in the vertexBuffer. In Vulkan, memory management is our own responsibility, which makes it an important topic in its own right.
But that is a digression. Below we walk through how a vertexBuffer is created, filled (copied into), and destroyed.

1) Creating the buffer

VKAPI_ATTR VkResult VKAPI_CALL vkCreateBuffer(
    VkDevice                     device,
    const VkBufferCreateInfo*    pCreateInfo,
    const VkAllocationCallbacks* pAllocator,
    VkBuffer*                    pBuffer);
typedef struct VkBufferCreateInfo {
    VkStructureType        sType;                 // VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO
    const void*            pNext;                 // nullptr
    VkBufferCreateFlags    flags;                 // related to sparse-buffer creation
    VkDeviceSize           size;                  // size of the buffer to create, in bytes
    VkBufferUsageFlags     usage;                 // how the buffer will be used
    VkSharingMode          sharingMode;           // whether the buffer may be shared by multiple queue families
    uint32_t               queueFamilyIndexCount;
    const uint32_t*        pQueueFamilyIndices;   // which queue families will use the buffer
} VkBufferCreateInfo;
typedef enum VkBufferUsageFlagBits {
    VK_BUFFER_USAGE_TRANSFER_SRC_BIT = 0x00000001,
    VK_BUFFER_USAGE_TRANSFER_DST_BIT = 0x00000002,
    VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT = 0x00000004,
    VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT = 0x00000008,
    VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT = 0x00000010,
    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT = 0x00000020,
    VK_BUFFER_USAGE_INDEX_BUFFER_BIT = 0x00000040,
    VK_BUFFER_USAGE_VERTEX_BUFFER_BIT = 0x00000080,
    VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT = 0x00000100,
    VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT = 0x00020000,
    VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT = 0x00000800,
    VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT = 0x00001000,
    VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT = 0x00000200,
    VK_BUFFER_USAGE_RAY_TRACING_BIT_KHR = 0x00000400,
    VK_BUFFER_USAGE_RAY_TRACING_BIT_NV = VK_BUFFER_USAGE_RAY_TRACING_BIT_KHR,
    VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT = VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
    VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR = VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
    VK_BUFFER_USAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
} VkBufferUsageFlagBits;
typedef enum VkSharingMode {
    VK_SHARING_MODE_EXCLUSIVE = 0,   // the buffer is used by a single queue family at a time
    VK_SHARING_MODE_CONCURRENT = 1,  // the buffer may be used by multiple queue families, possibly at a cost in efficiency
    VK_SHARING_MODE_MAX_ENUM = 0x7FFFFFFF
} VkSharingMode;

When sharingMode is VK_SHARING_MODE_CONCURRENT, pQueueFamilyIndices (together with queueFamilyIndexCount) must be set to indicate which queue families will use the buffer.
When sharingMode is VK_SHARING_MODE_EXCLUSIVE, pQueueFamilyIndices does not need to be set.
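
As a concrete illustration of this first step, here is a minimal sketch of filling VkBufferCreateInfo for a vertex buffer and calling vkCreateBuffer; it assumes a valid VkDevice named device and a bufferSize already computed from the vertex data:

VkBufferCreateInfo bufferInfo{};
bufferInfo.sType       = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
bufferInfo.size        = bufferSize;                        // total size of the vertex data in bytes
bufferInfo.usage       = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT; // the buffer will feed the vertex input stage
bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;         // single queue family, so pQueueFamilyIndices stays unset

VkBuffer vertexBuffer;
if (vkCreateBuffer(device, &bufferInfo, nullptr, &vertexBuffer) != VK_SUCCESS) {
    throw std::runtime_error("failed to create vertex buffer!");
}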

2) Allocating device memory

Although the vertexBuffer has been created above, no device memory has actually been allocated for it, so the GPU still cannot access any storage behind it. We therefore need to allocate memory on the device and bind it to the buffer.

The first step is to query how much device memory the vertexBuffer requires:

VKAPI_ATTR void VKAPI_CALL vkGetBufferMemoryRequirements(
    VkDevice                device,
    VkBuffer                buffer,
    VkMemoryRequirements*   pMemoryRequirements);

The second step is to allocate the device memory:

VKAPI_ATTR VkResult VKAPI_CALL vkAllocateMemory(
    VkDevice                     device,
    const VkMemoryAllocateInfo*  pAllocateInfo,
    const VkAllocationCallbacks* pAllocator,
    VkDeviceMemory*              pMemory);

typedef struct VkMemoryAllocateInfo {
    VkStructureType    sType;            // VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO
    const void*        pNext;
    VkDeviceSize       allocationSize;   // the size queried above
    uint32_t           memoryTypeIndex;  // index of the memory type to allocate from
} VkMemoryAllocateInfo;

The third step is to bind the vertexBuffer to the allocated device memory pMemory:

VKAPI_ATTR VkResult VKAPI_CALL vkBindBufferMemory(
    VkDevice        device,
    VkBuffer        buffer,
    VkDeviceMemory  memory,
    VkDeviceSize    memoryOffset);
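
Putting the three steps together, here is a hedged sketch of the sequence. It reuses the device and vertexBuffer from step 1, passes host-visible properties purely as an example (matching the map/memcpy flow in section 3), and relies on a findMemoryType helper like the one sketched at the end of this section:

VkMemoryRequirements memRequirements;
vkGetBufferMemoryRequirements(device, vertexBuffer, &memRequirements);   // step 1: query size, alignment and allowed types

VkMemoryAllocateInfo allocInfo{};
allocInfo.sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
allocInfo.allocationSize  = memRequirements.size;                        // use the queried size, not the original buffer size
allocInfo.memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits,
                                           VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);

VkDeviceMemory vertexBufferMemory;
if (vkAllocateMemory(device, &allocInfo, nullptr, &vertexBufferMemory) != VK_SUCCESS) {  // step 2: allocate
    throw std::runtime_error("failed to allocate vertex buffer memory!");
}

vkBindBufferMemory(device, vertexBuffer, vertexBufferMemory, 0);         // step 3: bind at offset 0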

Finally, note the memoryTypeIndex used in step two. It is an index into the physical device's memory types: the memoryTypeBits field returned by vkGetBufferMemoryRequirements is a bitmask of which memory types the buffer may use, so we first query the memory types the device supports and then pick one that both appears in that mask and has the properties we expect (a sketch of such a lookup follows after the flag list below).

VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceMemoryProperties(
    VkPhysicalDevice                    physicalDevice,
    VkPhysicalDeviceMemoryProperties*   pMemoryProperties);

The common memory property flags are:

typedef enum VkMemoryPropertyFlagBits {
    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT = 0x00000001,
    VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT = 0x00000002,
    VK_MEMORY_PROPERTY_HOST_COHERENT_BIT = 0x00000004,
    VK_MEMORY_PROPERTY_HOST_CACHED_BIT = 0x00000008,
    VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT = 0x00000010,
    VK_MEMORY_PROPERTY_PROTECTED_BIT = 0x00000020,
    VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD = 0x00000040,
    VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD = 0x00000080,
    VK_MEMORY_PROPERTY_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
} VkMemoryPropertyFlagBits;
typedef VkFlags VkMemoryPropertyFlags;

Memory management will get a dedicated section later, so we only skim it here.
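
The real code in section 6 calls a findMemoryType helper that is not shown in this article. A minimal sketch of how such a lookup might look, assuming a valid VkPhysicalDevice named physicalDevice:

uint32_t findMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) {
    VkPhysicalDeviceMemoryProperties memProperties;
    vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties);

    for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
        // bit i of typeFilter (memoryTypeBits) says whether memory type i is allowed for this buffer,
        // and the type's propertyFlags must contain every property we asked for
        if ((typeFilter & (1 << i)) &&
            (memProperties.memoryTypes[i].propertyFlags & properties) == properties) {
            return i;
        }
    }

    throw std::runtime_error("failed to find suitable memory type!");
}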

3) Filling the vertex buffer

In a real application, vertex data is loaded from a file into host memory, but the GPU cannot see that memory, so a copy is needed to move it from host memory into device memory.
This copy cannot be done directly either, because the CPU in turn cannot see device memory. We therefore first map the device memory into CPU-visible address space, perform the copy, and then unmap it:

void* data;
vkMapMemory(device, vertexBufferMemory, 0, bufferInfo.size, 0, &data);
memcpy(data, vertices.data(), (size_t) bufferInfo.size);
vkUnmapMemory(device, vertexBufferMemory);

The mapping function:

VKAPI_ATTR VkResult VKAPI_CALL vkMapMemory(
    VkDevice           device,
    VkDeviceMemory     memory,
    VkDeviceSize       offset,  // offset into the allocation, usually 0 to map from the start
    VkDeviceSize       size,
    VkMemoryMapFlags   flags,   // set to 0; currently unused
    void**             ppData);

The unmapping function:

VKAPI_ATTR void VKAPI_CALL vkUnmapMemory(
    VkDevice        device,
    VkDeviceMemory  memory);

4) The stagingBuffer

Strictly speaking, the steps above already get the job done, so what is this stagingBuffer for?

The reason is that the vertexBuffer is read frequently by the vertex shader, so its memory should be allocated with VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, which the GPU can read fastest.
The CPU, however, cannot copy data into a buffer backed by VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT memory; to use the map-and-memcpy approach above, the vertexBuffer's memory would have to be VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT instead.
The copy problem is therefore solved by adding an intermediate buffer.

That is, we first copy the vertex data into the stagingBuffer and then transfer it from there into the vertexBuffer, which introduces one extra transfer operation.

Because this transfer is a device-memory-to-device-memory operation, memcpy cannot be used; instead the following command is recorded:

VKAPI_ATTR void VKAPI_CALL vkCmdCopyBuffer(
    VkCommandBuffer      commandBuffer,
    VkBuffer             srcBuffer,
    VkBuffer             dstBuffer,
    uint32_t             regionCount,
    const VkBufferCopy*  pRegions);

Since this is a command, the usual command buffer workflow applies: allocate a command buffer, record into it, and submit it to a queue.

5) About the IndexBuffer

The indexBuffer follows exactly the same flow as the vertexBuffer, so it is not explained separately here.
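
For completeness, here is a sketch of what a corresponding createIndexBuffer might look like. It mirrors the createVertexBuffer walked through in section 6 below and assumes the same createBuffer/copyBuffer helpers, plus an indices array (e.g. a std::vector<uint16_t>) and indexBuffer/indexBufferMemory members:

void createIndexBuffer() {
    VkDeviceSize bufferSize = sizeof(indices[0]) * indices.size();

    // staging buffer in host-visible memory, filled via map/memcpy/unmap
    VkBuffer stagingBuffer;
    VkDeviceMemory stagingBufferMemory;
    createBuffer(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
                 VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
                 stagingBuffer, stagingBufferMemory);

    void* data;
    vkMapMemory(device, stagingBufferMemory, 0, bufferSize, 0, &data);
    memcpy(data, indices.data(), (size_t)bufferSize);
    vkUnmapMemory(device, stagingBufferMemory);

    // the only real difference: VK_BUFFER_USAGE_INDEX_BUFFER_BIT instead of VERTEX_BUFFER_BIT
    createBuffer(bufferSize, VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
                 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, indexBuffer, indexBufferMemory);

    copyBuffer(stagingBuffer, indexBuffer, bufferSize);

    vkDestroyBuffer(device, stagingBuffer, nullptr);
    vkFreeMemory(device, stagingBufferMemory, nullptr);
}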

6) Walking through the actual code

void createVertexBuffer() {
    VkDeviceSize bufferSize = sizeof(vertices[0]) * vertices.size();

    VkBuffer stagingBuffer;
    VkDeviceMemory stagingBufferMemory;
    createBuffer(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBuffer, stagingBufferMemory);

    void* data;
    vkMapMemory(device, stagingBufferMemory, 0, bufferSize, 0, &data);
    memcpy(data, vertices.data(), (size_t)bufferSize);
    vkUnmapMemory(device, stagingBufferMemory);

    createBuffer(bufferSize, VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, vertexBuffer, vertexBufferMemory);

    copyBuffer(stagingBuffer, vertexBuffer, bufferSize);

    vkDestroyBuffer(device, stagingBuffer, nullptr);
    vkFreeMemory(device, stagingBufferMemory, nullptr);
}

The function first creates stagingBuffer and stagingBufferMemory; note their usage and property flags:
VK_BUFFER_USAGE_TRANSFER_SRC_BIT marks the buffer as the source of the later vkCmdCopyBuffer.
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT means the memory can be mapped into the CPU (host) address space. Memory without this flag cannot be accessed directly by the CPU; its contents must first be transferred into host-mappable memory before the CPU can read them.
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT means that once the memory is mapped, host writes are not hidden behind a cache: any modification becomes visible without an explicit flush or write-back.

It then creates vertexBuffer and vertexBufferMemory; again note their usage and property flags:
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT makes the buffer usable as a transfer destination and bindable with vkCmdBindVertexBuffers.
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT places it in memory that only the GPU can access, for the best access efficiency.

The buffer-creation logic is factored out into the following helper:

void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory) {
    VkBufferCreateInfo bufferInfo{};
    bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
    bufferInfo.size = size;
    bufferInfo.usage = usage;
    bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;

    if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
        throw std::runtime_error("failed to create buffer!");
    }

    VkMemoryRequirements memRequirements;
    vkGetBufferMemoryRequirements(device, buffer, &memRequirements);

    VkMemoryAllocateInfo allocInfo{};
    allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    allocInfo.allocationSize = memRequirements.size;
    allocInfo.memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties);

    if (vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) {
        throw std::runtime_error("failed to allocate buffer memory!");
    }

    vkBindBufferMemory(device, buffer, bufferMemory, 0);
}

The transfer from stagingBuffer to vertexBuffer is wrapped in the function below.
It is implemented with vkCmdCopyBuffer, but since that is a command, it has to be recorded into a command buffer and submitted to a queue for execution.

void copyBuffer(VkBuffer srcBuffer, VkBuffer dstBuffer, VkDeviceSize size) {
    VkCommandBuffer commandBuffer = beginSingleTimeCommands();

    VkBufferCopy copyRegion{};
    copyRegion.size = size;
    vkCmdCopyBuffer(commandBuffer, srcBuffer, dstBuffer, 1, &copyRegion);

    endSingleTimeCommands(commandBuffer);
}

First a command buffer is allocated and recording is started:

VkCommandBuffer beginSingleTimeCommands() {
    VkCommandBufferAllocateInfo allocInfo{};
    allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
    allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
    allocInfo.commandPool = commandPool;
    allocInfo.commandBufferCount = 1;

    VkCommandBuffer commandBuffer;
    vkAllocateCommandBuffers(device, &allocInfo, &commandBuffer);

    VkCommandBufferBeginInfo beginInfo{};
    beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

    vkBeginCommandBuffer(commandBuffer, &beginInfo);

    return commandBuffer;
}

Recording is then ended and the command buffer submitted. Note that we must wait for the command to finish executing before freeing the commandBuffer; here vkQueueWaitIdle is used directly rather than a fence or semaphore.

void endSingleTimeCommands(VkCommandBuffer commandBuffer) {
    vkEndCommandBuffer(commandBuffer);

    VkSubmitInfo submitInfo{};
    submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submitInfo.commandBufferCount = 1;
    submitInfo.pCommandBuffers = &commandBuffer;

    vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE);
    vkQueueWaitIdle(graphicsQueue);

    vkFreeCommandBuffers(device, commandPool, 1, &commandBuffer);
}

Finally, the stagingBuffer is destroyed and stagingBufferMemory is freed.
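
The vertexBuffer itself lives for the rest of the application: it is bound at draw time and only destroyed at shutdown. A brief sketch of both, assuming the commandBuffer being recorded for drawing and the handles used throughout this article:

// at draw time: bind the vertex buffer to binding 0 before issuing the draw call
VkBuffer vertexBuffers[] = { vertexBuffer };
VkDeviceSize offsets[] = { 0 };
vkCmdBindVertexBuffers(commandBuffer, 0, 1, vertexBuffers, offsets);

// at shutdown: destroy the buffer first, then free the memory that backs it
vkDestroyBuffer(device, vertexBuffer, nullptr);
vkFreeMemory(device, vertexBufferMemory, nullptr);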
