Vulkan之barrierExample

资料来源:https://github.com/KhronosGroup/Vulkan-Docs/wiki/Synchronization-Examples

Vulkan的barrier是个很复杂的概念,相对于其他的同步原语,使用难度较高,因此Khronos官方提供了一些用例来指导barrier的使用,下面我们来翻译并分析一下这些用例。

Compute to Compute Dependencies

  • 第一个dispatch写buffer,第二个dispatch读buffer,则设置如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
vkCmdDispatch(...);

VkMemoryBarrier memoryBarrier = {
...
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT };

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dstStageMask
1, // memoryBarrierCount
&memoryBarrier, // pMemoryBarriers
...);

vkCmdDispatch(...);
  • 第一个dispatch读buffer,第二个dispatch写buffer,这是读后写(write-after-read)的场景,不需要memory barrier来保证数据的可用性和可见性,只需要保证执行顺序,所以设置一个execution barrier就行了,如下所示:
1
2
3
4
5
6
7
8
9
vkCmdDispatch(...);

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dstStageMask
...);

vkCmdDispatch(...);
  • 第一个dispatch写storage image,第二个dispatch读storage image,则设置如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
vkCmdDispatch(...);

// Storage image to storage image dependencies are always in GENERAL layout; no need for a layout transition
VkMemoryBarrier memoryBarrier = {
...
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT};

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dstStageMask
1, // memoryBarrierCount
&memoryBarrier, // pMemoryBarriers
...);

vkCmdDispatch(...);
  • 三个dispatch,第一个dispatch写storage image区域,第二个dispatch写同一个storage image的不重叠的区域,第三个dispatch读所有的区域
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
vkCmdDispatch(...);
vkCmdDispatch(...);

VkMemoryBarrier memoryBarrier = {
...
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT };

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dstStageMask
1, // memoryBarrierCount
&memoryBarrier, // pMemoryBarriers
...);

vkCmdDispatch(...);
  • 三个dispatch,第一个dispatch写storage image,第二个dispatch写另一个storage image,第三个dispatch读所有的storage image
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
vkCmdDispatch(...);
vkCmdDispatch(...);

VkMemoryBarrier memoryBarrier = {
...
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT };

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dstStageMask
1, // memoryBarrierCount
&memoryBarrier, // pMemoryBarriers
...);

vkCmdDispatch(...);

上面我们用到的是pMemoryBarriers,这个是全局的内存屏障(global memory barrier)。全局内存屏障覆盖所有资源。通常认为执行全局内存屏障比按资源设置屏障更高效,按资源的屏障(buffer/image memory barrier)通常只用于队列所有权转移和图像布局转换,其他情况下应该使用全局屏障。

Compute to Graphics Dependencies

当计算管线和图形管线对同一资源进行读写时,barrier的设置如下。(实际上对于图形管线的资源交互,最好是用subpass的dependencies来解决,而不是用barrier)

  • dispatch写storage buffer,draw读取buffer用作index buffer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
vkCmdDispatch(...);

VkMemoryBarrier memoryBarrier = {
...
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_INDEX_READ_BIT };

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, // dstStageMask
1, // memoryBarrierCount
&memoryBarrier, // pMemoryBarriers
...);

... // Render pass setup etc.

vkCmdDraw(...);
  • dispatch写storage buffer,draw读取该buffer用作index buffer,之后另一个dispatch读取同一个buffer用作uniform buffer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
vkCmdDispatch(...);

// Batch barriers where possible if it doesn't change how synchronization takes place
VkMemoryBarrier memoryBarrier1 = {
...
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_INDEX_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT};

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dstStageMask
1, // memoryBarrierCount
&memoryBarrier1, // pMemoryBarriers
...);

... // Render pass setup etc.

vkCmdDraw(...);

... // Render pass teardown etc.

vkCmdDispatch(...);
  • dispatch写storage buffer,draw读取buffer用作indirect buffer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
vkCmdDispatch(...);

VkMemoryBarrier memoryBarrier = {
...
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT };

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, // dstStageMask
1, // memoryBarrierCount
&memoryBarrier, // pMemoryBarriers
...);

... // Render pass setup etc.

vkCmdDrawIndirect(...);
  • dispatch写storage image,draw在fragment shader中对这个image进行采样

    注意这里用到了pImageMemoryBarriers,并且做了layout转换

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
vkCmdDispatch(...);

VkImageMemoryBarrier imageMemoryBarrier = {
...
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
/* .image and .subresourceRange should identify image subresource accessed */};

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // dstStageMask
...
1, // imageMemoryBarrierCount
&imageMemoryBarrier, // pImageMemoryBarriers
...);


... // Render pass setup etc.

vkCmdDraw(...);
  • dispatch写storage texel buffer,draw读取该buffer用作indirect buffer,随后又将该buffer作为uniform buffer在fragment shader中读取
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
vkCmdDispatch(...);

VkMemoryBarrier memoryBarrier = {
...
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT};

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // dstStageMask
1, // memoryBarrierCount
&memoryBarrier, // pMemoryBarriers
...);

... // Render pass setup etc.

vkCmdDrawIndirect(...);

Graphics to Compute Dependencies

  • draw写入color attachment,dispatch对这个image采样

    color attachment write(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT)是一个单独的stageMask,它不属于fragment shader(VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
vkCmdDraw(...);

... // Render pass teardown etc.

VkImageMemoryBarrier imageMemoryBarrier = {
...
.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
/* .image and .subresourceRange should identify image subresource accessed */};

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // srcStageMask
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dstStageMask
...
1, // imageMemoryBarrierCount
&imageMemoryBarrier, // pImageMemoryBarriers
...);

vkCmdDispatch(...);
  • draw写depth attachment, dispatch对这个image采样
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
vkCmdDraw(...);

... // Render pass teardown etc.

VkImageMemoryBarrier imageMemoryBarrier = {
...
.srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
/* .image and .subresourceRange should identify image subresource accessed */};

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, // srcStageMask
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dstStageMask
...
1, // imageMemoryBarrierCount
&imageMemoryBarrier, // pImageMemoryBarriers
...);

vkCmdDispatch(...);

Graphics to Graphics Dependencies

在同一个renderpass内,渲染管线到渲染管线的依赖关系一般由subpass的dependency来解决,这比用barrier更加高效。

下面的例子中,同一个render pass内的依赖以subpass dependency的形式表示,跨render pass的依赖则使用pipeline barrier。

  • 第一个draw写depth attachment,第二个draw在fragment shader中将其作为input attachment读取

    下面用到了image layout的转换,在renderpass中,会自动进行从VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL到VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL的转换,不需要用barrier显式转换。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
// Set this to the index in VkRenderPassCreateInfo::pAttachments where the depth image is described.
uint32_t depthAttachmentIndex = ...;

VkSubpassDescription subpasses[2];

VkAttachmentReference depthAttachment = {
.attachment = depthAttachmentIndex,
.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL};

// Subpass containing first draw
subpasses[0] = {
...
.pDepthStencilAttachment = &depthAttachment,
...};

VkAttachmentReference depthAsInputAttachment = {
.attachment = depthAttachmentIndex,
.layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL};

// Subpass containing second draw
subpasses[1] = {
...
.inputAttachmentCount = 1,
.pInputAttachments = &depthAsInputAttachment,
...};

VkSubpassDependency dependency = {
.srcSubpass = 0,
.dstSubpass = 1,
.srcStageMask = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
.dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
.srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
.dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT,
.dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT};

// If initialLayout does not match the layout of the attachment reference in the first subpass, there will be an implicit transition before starting the render pass.
// If finalLayout does not match the layout of the attachment reference in the last subpass, there will be an implicit transition at the end.
VkAttachmentDescription depthFramebufferAttachment = {
...
.initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
.finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL};

VkRenderPassCreateInfo renderPassCreateInfo = {
...
.attachmentCount = 1,
.pAttachments = &depthFramebufferAttachment,
.subpassCount = 2,
.pSubpasses = subpasses,
.dependencyCount = 1,
.pDependencies = &dependency};

vkCreateRenderPass(...);

...
  • 第一个draw写depth attachment,第二个draw在fragment shader中将其作为depth image进行采样

    上面的例子是用subpass dependency,这里用Barrier解决

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
vkCmdDraw(...);

... // First render pass teardown etc.

VkImageMemoryBarrier imageMemoryBarrier = {
...
.srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
/* .image and .subresourceRange should identify image subresource accessed */};

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, // srcStageMask
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // dstStageMask
...
1, // imageMemoryBarrierCount
&imageMemoryBarrier, // pImageMemoryBarriers
...);

... // Second render pass setup etc.

vkCmdDraw(...);
  • 第一个draw写color attachment,第二个draw在fragment shader中将其作为input attachment读取,这里用subpass dependency解决
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
// Set this to the index in VkRenderPassCreateInfo::pAttachments where the color image is described.
uint32_t colorAttachmentIndex = ...;

VkSubpassDescription subpasses[2];

VkAttachmentReference colorAttachment = {
.attachment = colorAttachmentIndex,
.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL};

// Subpass containing first draw
subpasses[0] = {
...
.colorAttachmentCount = 1,
.pColorAttachments = &colorAttachment,
...};

VkAttachmentReference colorAsInputAttachment = {
.attachment = colorAttachmentIndex,
.layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL};

// Subpass containing second draw
subpasses[1] = {
...
.inputAttachmentCount = 1,
.pInputAttachments = &colorAsInputAttachment,
...};

VkSubpassDependency dependency = {
.srcSubpass = 0,
.dstSubpass = 1,
.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
.dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
.dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT,
.dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT};

// If initialLayout does not match the layout of the attachment reference in the first subpass, there will be an implicit transition before starting the render pass.
// If finalLayout does not match the layout of the attachment reference in the last subpass, there will be an implicit transition at the end.
VkAttachmentDescription colorFramebufferAttachment = {
...
.initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
.finalLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL};

VkRenderPassCreateInfo renderPassCreateInfo = {
...
.attachmentCount = 1,
.pAttachments = &colorFramebufferAttachment,
.subpassCount = 2,
.pSubpasses = subpasses,
.dependencyCount = 1,
.pDependencies = &dependency};

vkCreateRenderPass(...);

...
  • 第一个draw写color attachment,第二个draw在fragment shader阶段将其作为color image进行采样,这里用barrier解决
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
vkCmdDraw(...);

... // First render pass teardown etc.

VkImageMemoryBarrier imageMemoryBarrier = {
...
.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
/* .image and .subresourceRange should identify image subresource accessed */};

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // srcStageMask
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // dstStageMask
...
1, // imageMemoryBarrierCount
&imageMemoryBarrier, // pImageMemoryBarriers
...);

... // Second render pass setup etc.

vkCmdDraw(...);
  • 第一个draw写color attachment,第二个draw在vertex shader阶段将其作为color image进行采样
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
vkCmdDraw(...);

... // First render pass teardown etc.

VkImageMemoryBarrier imageMemoryBarrier = {
...
.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
/* .image and .subresourceRange should identify image subresource accessed */};

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // srcStageMask
VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, // dstStageMask
...
1, // imageMemoryBarrierCount
&imageMemoryBarrier, // pImageMemoryBarriers
...);

... // Second render pass setup etc.

vkCmdDraw(...);
  • 第一个draw是在fragment shader阶段将纹理图像进行采样,第二个draw将这个纹理图像作为color attachment向其写入内容

    对于这种读后写的问题,正常来讲只需要一个execution barrier就可以了。但是因为是图像,涉及到了图像的布局转换,所以这里要用imageMemoryBarrier

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
vkCmdDraw(...);

... // First render pass teardown etc.

VkImageMemoryBarrier imageMemoryBarrier = {
...
.srcAccessMask = 0,
.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL
/* .image and .subresourceRange should identify image subresource accessed */};

vkCmdPipelineBarrier(
...
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // srcStageMask
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // dstStageMask
...
1, // imageMemoryBarrierCount
&imageMemoryBarrier, // pImageMemoryBarriers
...);

... // Second render pass setup etc.

vkCmdDraw(...);
  • 第一个renderpass写depth attachment,第二个renderpass重用这个depth attachment

    这是一个写后写(write-after-write)的问题,因此它总是需要memory dependency。即使第二个render pass不读取上一个render pass的输出(实际上,在下面的例子中,由于我们将图像的initialLayout设置为UNDEFINED,本来就不会保留上一个render pass生成的内容),我们依旧需要一个memory dependency来确保对图像写入的顺序。

    此外,由于我们使用了自动布局转换(initialLayout与subpass中attachment reference的layout不同),因此必须确保转换不会过早发生。这通常需要显式指定一个VK_SUBPASS_EXTERNAL的subpass dependency,因为默认的隐式依赖(srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)是不够的。

    下面的示例使用VK_SUBPASS_EXTERNAL的subpass dependency来同时实现这两个目标(解决write-after-write hazard,并推迟自动布局转换使其不会过早发生),当然这也可以使用pipeline barrier解决。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
// We're using the depth buffer as a depth-stencil attachment
VkAttachmentReference depthAttachment = {
.attachment = 0,
.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL};

VkAttachmentDescription depthFramebufferAttachment = {
...
.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, // Want to clear the buffer at the start of the subpass
.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, // No need to preserve previous image contents
.finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL // When done, leave it in the layout used in the subpass (no transition at the end)
};

// Subpass using the depth-buffer
VkSubpassDescription subpass = {
...
.pDepthStencilAttachment = &depthAttachment,
...};

// Use an incoming subpass-dependency to ensure:
// * Previous use of the depth-buffer is complete (execution dependency).
// * WAW hazard is resolved (e.g. caches are flushed and invalidated so old and new writes are not re-ordered).
// * Transition from UNDEFINED -> VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL happens-after previous `EARLY/LATE_FRAGMENT_TESTS` use.
// * Changes made to the image by the transition are accounted for by setting the appropriate dstAccessMask.
VkSubpassDependency dependency = {
.srcSubpass = VK_SUBPASS_EXTERNAL,
.dstSubpass = 0,
.srcStageMask = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, // Both stages might have access the depth-buffer, so need both in src/dstStageMask
.dstStageMask = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
.srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT,
.dependencyFlags = 0};

VkRenderPassCreateInfo renderPassCreateInfo = {
...
.attachmentCount = 1,
.pAttachments = &depthFramebufferAttachment,
.subpassCount = 1,
.pSubpasses = &subpass,
.dependencyCount = 1,
.pDependencies = &dependency};

vkCreateRenderPass(...);

...

// First render-pass
vkCmdBeginRenderPass();
...
vkCmdEndRenderPass();

...

// Second render-pass, could be the same or a different frame
vkCmdBeginRenderPass();
...
vkCmdEndRenderPass();

Transfer Dependencies

  • 从CPU上传数据到vertex buffer

    • host memory与device memory不是一体的(discrete)(比如独显的PC)

      Setup:

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
      57
      58
      59
      60
      61
      62
      63
      64
      65
      66
      67
      68
      69
      70
      71
      // Data and size of that data
      const uint32_t vertexDataSize = ... ;
      const void* pData = ... ;

      // Create a staging buffer for upload
      VkBufferCreateInfo stagingCreateInfo = {
      ...
      .size = vertexDataSize,
      .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
      ... };

      VkBuffer stagingBuffer;
      vkCreateBuffer(device, &stagingCreateInfo, NULL, &stagingBuffer);

      // Create the vertex buffer
      VkBufferCreateInfo vertexCreateInfo = {
      ...
      .size = vertexDataSize,
      .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
      ... };

      VkBuffer vertexBuffer;
      vkCreateBuffer(device, &vertexCreateInfo, NULL, &vertexBuffer);

      ...

      // Allocate and memory bind memory for these buffers.
      // Ensure that the staging buffer uses a memory type that has
      // VK_MEMORY_PROPERTY_HOST_VISIBLE property and doesn't have
      // VK_MEMORY_PROPERTY_DEVICE_LOCAL.
      // The vertex buffer memory should be the opposite - it should include
      // VK_MEMORY_PROPERTY_DEVICE_LOCAL and should not have
      // VK_MEMORY_PROPERTY_HOST_VISIBLE.
      // Use the example code documented in the description of
      // VkPhysicalDeviceMemoryProperties:
      // https://www.khronos.org/registry/vulkan/specs/1.0/man/html/VkPhysicalDeviceMemoryProperties.html

      ...

      // Map the staging buffers - if you plan to re-use these (which you should),
      // keep them mapped.
      // Ideally just map the whole range at once as well.

      void* stagingData;

      vkMapMemory(
      ...
      stagingMemory,
      stagingMemoryOffset,
      vertexDataSize,
      0,
      &stagingData);

      // Write data directly into the mapped pointer
      fread(stagingData, vertexDataSize, 1, vertexFile);

      // Flush the memory range
      // If the memory type of stagingMemory includes VK_MEMORY_PROPERTY_HOST_COHERENT, skip this step

      // Align to the VkPhysicalDeviceProperties::nonCoherentAtomSize
      uint32_t alignedSize = (vertexDataSize-1) - ((vertexDataSize-1) % nonCoherentAtomSize) + nonCoherentAtomSize;

      // Setup the range
      VkMappedMemoryRange stagingRange = {
      ...
      .memory = stagingMemory,
      .offset = stagingMemoryOffset,
      .size = alignedSize};

      // Flush the range
      vkFlushMappedMemoryRanges(device, 1, &stagingRange);

      录制commandBuffer,并且提交submission

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
      57
      58
      59
      60
      61
      62
      63
      64
      65
      66
      67
      68
      69
      70
      71
      72
      73
      74
      75
      76
      77
      78
      79
      80
      81
      82
      83
      84
      85
      86
      87
      88
      89
      90
      91
      92
      93
      94
      95
      96
      97
      98
      99
      100
      vkBeginCommandBuffer(...);

      // Submission guarantees the host write being complete, as per
      // https://www.khronos.org/registry/vulkan/specs/1.0/html/vkspec.html#synchronization-submission-host-writes
      // So no need for a barrier before the transfer

      // Copy the staging buffer contents to the vertex buffer
      VkBufferCopy vertexCopyRegion = {
      .srcOffset = stagingMemoryOffset,
      .dstOffset = vertexMemoryOffset,
      .size = vertexDataSize};

      vkCmdCopyBuffer(
      commandBuffer,
      stagingBuffer,
      vertexBuffer,
      1,
      &vertexCopyRegion);


      // If the graphics queue and transfer queue are the same queue
      if (isUnifiedGraphicsAndTransferQueue)
      {
      // If there is a semaphore signal + wait between this being submitted and
      // the vertex buffer being used, then skip this pipeline barrier.

      // Pipeline barrier before using the vertex data
      // Note that this can apply to all buffers uploaded in the same way, so
      // ideally batch all copies before this.
      VkMemoryBarrier memoryBarrier = {
      ...
      .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
      .dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT};

      vkCmdPipelineBarrier(
      ...
      VK_PIPELINE_STAGE_TRANSFER_BIT , // srcStageMask
      VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, // dstStageMask
      1, // memoryBarrierCount
      &memoryBarrier, // pMemoryBarriers
      ...);


      vkEndCommandBuffer(...);

      vkQueueSubmit(unifiedQueue, ...);
      }
      else
      {
      // Pipeline barrier to start a queue ownership transfer after the copy
      VkBufferMemoryBarrier bufferMemoryBarrier = {
      ...
      .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
      .dstAccessMask = 0,
      .srcQueueFamilyIndex = transferQueueFamilyIndex,
      .dstQueueFamilyIndex = graphicsQueueFamilyIndex,
      .buffer = vertexBuffer,
      ...};

      vkCmdPipelineBarrier(
      ...
      VK_PIPELINE_STAGE_TRANSFER_BIT , // srcStageMask
      VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, // dstStageMask
      1, // bufferMemoryBarrierCount
      &bufferMemoryBarrier, // pBufferMemoryBarriers
      ...);


      vkEndCommandBuffer(...);

      // Ensure a semaphore is signalled here which will be waited on by the graphics queue.
      vkQueueSubmit(transferQueue, ...);

      // Record a command buffer for the graphics queue.
      vkBeginCommandBuffer(...);

      // Pipeline barrier before using the vertex buffer, after finalising the ownership transfer
      VkBufferMemoryBarrier bufferMemoryBarrier = {
      ...
      .srcAccessMask = 0,
      .dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT,
      .srcQueueFamilyIndex = transferQueueFamilyIndex,
      .dstQueueFamilyIndex = graphicsQueueFamilyIndex,
      .buffer = vertexBuffer,
      ...};

      vkCmdPipelineBarrier(
      ...
      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, // srcStageMask
      VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, // dstStageMask
      ...
      1, // bufferMemoryBarrierCount
      &bufferMemoryBarrier, // pBufferMemoryBarriers
      ...);


      vkEndCommandBuffer(...);

      vkQueueSubmit(graphicsQueue, ...);
      }
    • host memory与device memory是一体的(UMA)(比如手机)

      对于UMA的系统,不需要使用staging的方式。

      Setup:

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
      57
      58
      59
      // Data and size of that data
      const uint32_t vertexDataSize = ... ;
      const void* pData = ... ;

      // Create the vertex buffer
      VkBufferCreateInfo vertexCreateInfo = {
      ...
      .size = vertexDataSize,
      .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
      ... };

      VkBuffer vertexBuffer;
      vkCreateBuffer(device, &vertexCreateInfo, NULL, &vertexBuffer);

      ...

      // Allocate and memory bind memory for this buffer.
      // It should use a memory type that includes HOST_VISIBLE, and ideally also
      // DEVICE_LOCAL if available.
      // Use the example code documented in the description of
      // VkPhysicalDeviceMemoryProperties:
      // https://www.khronos.org/registry/vulkan/specs/1.0/man/html/VkPhysicalDeviceMemoryProperties.html

      ...

      // Map the vertex buffer

      void* vertexData;

      vkMapMemory(
      ...
      vertexMemory,
      vertexMemoryOffset,
      vertexDataSize,
      0,
      &vertexData);

      // Write data directly into the mapped pointer
      fread(vertexData, vertexDataSize, 1, vertexFile);

      // Flush the memory range
      // If the memory type of vertexMemory includes VK_MEMORY_PROPERTY_HOST_COHERENT, skip this step

      // Align to the VkPhysicalDeviceProperties::nonCoherentAtomSize
      uint32_t alignedSize = (vertexDataSize-1) - ((vertexDataSize-1) % nonCoherentAtomSize) + nonCoherentAtomSize;

      // Setup the range
      VkMappedMemoryRange vertexRange = {
      ...
      .memory = vertexMemory,
      .offset = vertexMemoryOffset,
      .size = alignedSize};

      // Flush the range
      vkFlushMappedMemoryRanges(device, 1, &vertexRange);

      // You may want to skip this if you're going to modify the
      // data again
      vkUnmapMemory(device, vertexMemory);

      录制commandBuffer,并且提交submission,这个流程与上面是一致的,这里不再重复。

  • 从CPU上传图像,供fragment shader阶段采样使用

    对于UMA和discrete系统,下面的流程都是一样的,需要做layout的转换

    Setup:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    // Data and size of that data
    const uint32_t imageDataSize = ... ;

    // Create a staging buffer for upload
    VkBufferCreateInfo stagingCreateInfo = {
    ...
    .size = imageDataSize,
    .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
    ... };

    VkBuffer stagingBuffer;
    vkCreateBuffer(device, &stagingCreateInfo, NULL, &stagingBuffer);

    // Create the sampled image
    VkImageCreateInfo imageCreateInfo = {
    ...
    // Set the dimensions for the image as appropriate
    .tiling = VK_IMAGE_TILING_OPTIMAL,
    .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT,
    ... };

    VkImage image;
    vkCreateImage(device, &imageCreateInfo, NULL, &image);

    ...

    // Allocate and memory bind memory for these resources.
    // Ensure that the staging buffer uses a memory type that has
    // VK_MEMORY_PROPERTY_HOST_VISIBLE property and doesn't have
    // VK_MEMORY_PROPERTY_DEVICE_LOCAL.
    // The image memory should be the opposite - it should include
    // VK_MEMORY_PROPERTY_DEVICE_LOCAL and should not have
    // VK_MEMORY_PROPERTY_HOST_VISIBLE.
    // Use the example code documented in the description of
    // VkPhysicalDeviceMemoryProperties:
    // https://www.khronos.org/registry/vulkan/specs/1.0/man/html/VkPhysicalDeviceMemoryProperties.html

    ...

    // Map the staging buffers - if you plan to re-use these (which you should),
    // keep them mapped.
    // Ideally just map the whole range at once as well.

    void* stagingData;

    vkMapMemory(
    ...
    stagingMemory,
    stagingMemoryOffset,
    imageDataSize,
    0,
    &stagingData);

    // Write data directly into the mapped pointer
    fread(stagingData, imageDataSize, 1, imageFile);

    // Flush the memory range
    // If the memory type of stagingMemory includes VK_MEMORY_PROPERTY_HOST_COHERENT, skip this step

    // Align to the VkPhysicalDeviceProperties::nonCoherentAtomSize
    uint32_t alignedSize = (imageDataSize-1) - ((imageDataSize-1) % nonCoherentAtomSize) + nonCoherentAtomSize;

    // Setup the range
    VkMappedMemoryRange stagingRange = {
    ...
    .memory = stagingMemory,
    .offset = stagingMemoryOffset,
    .size = alignedSize};

    // Flush the range
    vkFlushMappedMemoryRanges(device, 1, &stagingRange);

    录制命令并提交submission

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    vkBeginCommandBuffer(...);

    // Submission guarantees the host write being complete, as per
    // https://www.khronos.org/registry/vulkan/specs/1.0/html/vkspec.html#synchronization-submission-host-writes
    // So no need for a barrier before the transfer for that purpose, but one is
    // required for the image layout changes.

    // Pipeline barrier before the copy to perform a layout transition
    VkImageMemoryBarrier preCopyMemoryBarrier = {
    ...
    .srcAccessMask = 0,
    .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
    .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
    .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .image = image,
    .subresourceRange = ... }; // Transition as much of the image as you can at once.

    vkCmdPipelineBarrier(
    ...
    VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, // srcStageMask
    VK_PIPELINE_STAGE_TRANSFER_BIT, // dstStageMask
    ...
    1, // imageMemoryBarrierCount
    &preCopyMemoryBarrier, // pImageMemoryBarriers
    ...);


    // Setup copies for the all regions required (should be batched into a single call where possible)
    vkCmdCopyBufferToImage(
    commandBuffer,
    stagingBuffer,
    image,
    ...);

    // If the graphics queue and transfer queue are the same queue
    if (isUnifiedGraphicsAndTransferQueue)
    {
    // Pipeline barrier before the image is read by fragment shaders
    VkImageMemoryBarrier postCopyMemoryBarrier = {
    ...
    .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
    .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
    .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
    .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .image = image,
    .subresourceRange = ... }; // Transition as much of the image as you can at once.

    vkCmdPipelineBarrier(
    ...
    VK_PIPELINE_STAGE_TRANSFER_BIT , // srcStageMask
    VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // dstStageMask
    ...
    1, // imageMemoryBarrierCount
    &postCopyMemoryBarrier, // pImageMemoryBarriers
    ...);

    vkEndCommandBuffer(...);

    vkQueueSubmit(unifiedQueue, ...);
    }
    else
    {
    // Queue family release operation, recorded on the transfer queue
    VkImageMemoryBarrier postCopyTransferMemoryBarrier = {
    ...
    .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
    .dstAccessMask = 0,
    .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
    .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
    .srcQueueFamilyIndex = transferQueueFamilyIndex,
    .dstQueueFamilyIndex = graphicsQueueFamilyIndex,
    .image = image,
    .subresourceRange = ... }; // Transition as much of the image as you can at once.

    vkCmdPipelineBarrier(
    ...
    VK_PIPELINE_STAGE_TRANSFER_BIT , // srcStageMask
    VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, // dstStageMask
    ...
    1, // imageMemoryBarrierCount
    &postCopyTransferMemoryBarrier, // pImageMemoryBarriers
    ...);

    vkEndCommandBuffer(...);

    vkQueueSubmit(transferQueue, ...);

    vkBeginCommandBuffer(...);

    // Queue family acquire operation, recorded on the graphics queue before fragment shaders read the image
    VkImageMemoryBarrier postCopyGraphicsMemoryBarrier = {
    ...
    .srcAccessMask = 0,
    .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
    .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
    .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
    .srcQueueFamilyIndex = transferQueueFamilyIndex,
    .dstQueueFamilyIndex = graphicsQueueFamilyIndex,
    .image = image,
    .subresourceRange = ... }; // Transition as much of the image as you can at once.

    vkCmdPipelineBarrier(
    ...
    VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, // srcStageMask
    VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // dstStageMask
    ...
    1, // imageMemoryBarrierCount
    &postCopyGraphicsMemoryBarrier, // pImageMemoryBarriers
    ...);

    vkEndCommandBuffer(...);

    vkQueueSubmit(graphicsQueue, ...);
    }
  • CPU回读compute shader写入的数据

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    vkCmdDispatch(...);

    VkMemoryBarrier memoryBarrier = {
    ...
    VK_ACCESS_SHADER_WRITE_BIT, // srcAccessMask
    VK_ACCESS_HOST_READ_BIT}; // dstAccessMask

    vkCmdPipelineBarrier(
    ...
    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
    VK_PIPELINE_STAGE_HOST_BIT, // dstStageMask
    1, // memoryBarrierCount
    &memoryBarrier, // pMemoryBarriers
    ...);

    vkEndCommandBuffer(...);

    vkQueueSubmit(..., fence); // Submit the command buffer with a fence

    GPU需要时间来处理,所以这应该与其他资源管理(例如交换链图像)一起流水线化

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    vkWaitForFences(fence);

    // If the memory is host coherent, skip this step - otherwise invalidation is necessary
    if (memoryIsNotHostCoherent) {
    VkMappedMemoryRange mappedMemoryRange = {
    ...
    mappedMemory, // Mapped pointer to the VkDeviceMemory allocation backing the buffer.
    ...
    };

    vkInvalidateMappedMemoryRanges(..., 1, &mappedMemoryRange);
    }

    // Read values back from the mapped pointer
    value = mappedMemory[...];

Interactions with semaphores

如果需要同步的两个命令之间存在一次Semaphore的signal/wait,那么原本由pipeline barrier/event/subpass dependency完成的额外同步就可以减少甚至省略。下面的示例只列出受semaphore依赖影响的参数。

对信号量执行signal操作时,会等待所有阶段执行完成,并且所有内存访问会自动变为可用(available)。类似地,对信号量执行wait操作会使所有已可用的内存访问变为可见(visible),并阻塞后续工作,直到该信号量被signal。请注意,对于 vkQueueSubmit,由 VkSubmitInfo::pWaitDstStageMask 明确指定在等待期间被阻止执行的阶段;对于其他所有信号量用法,所有工作的执行都会被阻止。

  • image之间的依赖需要做layout转换,layout转换(barrier)在semaphore signal/wait之前执行

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    vkCmdDispatch(...);

    VkImageMemoryBarrier imageMemoryBarrier = {
    ...
    .dstAccessMask = 0};

    vkCmdPipelineBarrier(
    ...
    VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, // dstStageMask
    ...);

    ... // Semaphore signal/wait happens here

    vkCmdDispatch(...);
  • image之间的依赖需要做layout转换,layout转换(barrier)在semaphore signal/wait之后执行

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    vkCmdDispatch(...);

    ... // Semaphore signal/wait happens here

    VkImageMemoryBarrier imageMemoryBarrier = {
    ...
    .srcAccessMask = 0};

    vkCmdPipelineBarrier(
    ...
    VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, // srcStageMask
    ...);


    vkCmdDispatch(...);

    srcStageMask 中使用的阶段必须等于(或在逻辑上晚于)相关信号量等待操作在 VkSubmitInfo::pWaitDstStageMask 中指定的阶段,否则无法保证屏障发生在信号量等待之后。在这个例子中,我们假设相关的 pWaitDstStageMask 值等于 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT

Swapchain Image Acquire and Present

  • Combined Graphics/Present Queue
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    VkAttachmentReference attachmentReference = {
    .attachment = 0,
    .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL};

    // Subpass containing first draw
    VkSubpassDescription subpass = {
    ...
    .colorAttachmentCount = 1,
    .pColorAttachments = &attachmentReference,
    ...};

    /* Only need a dependency coming in to ensure that the first
    layout transition happens at the right time.
    Second external dependency is implied by having a different
    finalLayout and subpass layout. */
    VkSubpassDependency dependency = {
    .srcSubpass = VK_SUBPASS_EXTERNAL,
    .dstSubpass = 0,
    // .srcStageMask needs to be a part of pWaitDstStageMask in the WSI semaphore.
    .srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
    .dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
    .srcAccessMask = 0,
    .dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
    .dependencyFlags = 0};

    /* Normally, we would need an external dependency at the end as well since we are changing layout in finalLayout,
    but since we are signalling a semaphore, we can rely on Vulkan's default behavior,
    which injects an external dependency here with
    dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
    dstAccessMask = 0. */

    VkAttachmentDescription attachmentDescription = {
    ...
    .loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE,
    .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
    ...
    // The image will automatically be transitioned from UNDEFINED to COLOR_ATTACHMENT_OPTIMAL for rendering, then out to PRESENT_SRC_KHR at the end.
    .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
    // Presenting images in Vulkan requires a special layout.
    .finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR};

    VkRenderPassCreateInfo renderPassCreateInfo = {
    ...
    .attachmentCount = 1,
    .pAttachments = &attachmentDescription,
    .subpassCount = 1,
    .pSubpasses = &subpass,
    .dependencyCount = 1,
    .pDependencies = &dependency};

    vkCreateRenderPass(...);

    ...

    vkAcquireNextImageKHR(
    ...
    acquireSemaphore, //semaphore
    ...
    &imageIndex); //image index

    VkPipelineStageFlags waitDstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;

    VkSubmitInfo submitInfo = {
    ...
    .waitSemaphoreCount = 1,
    .pWaitSemaphores = &acquireSemaphore,
    .pWaitDstStageMask = &waitDstStageMask,
    ...
    .signalSemaphoreCount = 1,
    .pSignalSemaphores = &graphicsSemaphore};

    vkQueueSubmit(..., &submitInfo, ...);

    VkPresentInfoKHR presentInfo = {
    .waitSemaphoreCount = 1,
    .pWaitSemaphores = &graphicsSemaphore,
    ...};

    vkQueuePresentKHR(..., &presentInfo);
  • Multiple Queues

    如果present Queue和render Queue不属于同一个queue family,那么就需要在两个Queue之间做queue family所有权转移(ownership transfer),这就需要额外的同步操作

    • Render pass setup:

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      VkAttachmentReference attachmentReference = {
      .attachment = 0,
      .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL};

      // Subpass containing first draw
      VkSubpassDescription subpass = {
      ...
      .colorAttachmentCount = 1,
      .pColorAttachments = &attachmentReference,
      ...};

      VkAttachmentDescription attachmentDescription = {
      ...
      .loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE,
      .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
      ...
      .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
      .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL};

      /* Due to these necessary extra synchronization points, it makes more sense
      to omit the sub pass external dependencies (which can't express a queue
      transfer), and batch the relevant operations with the new pipeline
      barriers we're introducing. */

      VkRenderPassCreateInfo renderPassCreateInfo = {
      ...
      .attachmentCount = 1,
      .pAttachments = &attachmentDescription,
      .subpassCount = 1,
      .pSubpasses = &subpass,
      .dependencyCount = 0,
      .pDependencies = NULL};

      vkCreateRenderPass(...);
    • Rendering command buffer - graphics queue:

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      /* Queue ownership transfer is only required when we need the content to remain valid across queues.
      Since we are transitioning from UNDEFINED -- and therefore discarding the image contents to begin with --
      we are not required to perform an ownership transfer from the presentation queue to graphics.

      This transition could also be made as an EXTERNAL -> subpass #0 render pass dependency as shown earlier. */

      VkImageMemoryBarrier imageMemoryBarrier = {
      ...
      .srcAccessMask = 0,
      .dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
      .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
      .newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      /* .image and .subresourceRange should identify image subresource accessed */};

      vkCmdPipelineBarrier(
      ...
      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // srcStageMask
      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // dstStageMask
      ...
      1, // imageMemoryBarrierCount
      &imageMemoryBarrier, // pImageMemoryBarriers
      ...);


      ... // Render pass submission.

      // Queue release operation. dstAccessMask should always be 0.
      VkImageMemoryBarrier imageMemoryBarrier = {
      ...
      .srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
      .dstAccessMask = 0,
      .oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
      .newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
      .srcQueueFamilyIndex = graphicsQueueFamilyIndex, // index of the graphics queue family
      .dstQueueFamilyIndex = presentQueueFamilyIndex, // index of the present queue family
      /* .image and .subresourceRange should identify image subresource accessed */};

      vkCmdPipelineBarrier(
      ...
      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // srcStageMask
      VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, // dstStageMask
      ...
      1, // imageMemoryBarrierCount
      &imageMemoryBarrier, // pImageMemoryBarriers
      ...);
    • Pre-present commands - presentation queue:

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      // After submitting the render pass...
      VkImageMemoryBarrier imageMemoryBarrier = {
      ...
      .srcAccessMask = 0,
      .dstAccessMask = 0,
      // A layout transition which happens as part of an ownership transfer needs to be specified twice: once for the release, and once for the acquire.
      // No srcAccessMask is needed, waiting for a semaphore does that automatically.
      // No dstAccessMask is needed, signalling a semaphore does that automatically.
      .oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
      .newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
      .srcQueueFamilyIndex = graphicsQueueFamilyIndex, // index of the graphics queue family
      .dstQueueFamilyIndex = presentQueueFamilyIndex, // index of the present queue family
      /* .image and .subresourceRange should identify image subresource accessed */};

      vkCmdPipelineBarrier(
      ...
      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, // srcStageMask
      VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, // dstStageMask
      ...
      1, // imageMemoryBarrierCount
      &imageMemoryBarrier, // pImageMemoryBarriers
      ...);
    • Queue submission:

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      vkAcquireNextImageKHR(
      ...
      acquireSemaphore, //semaphore
      ...
      &imageIndex); //image index

      VkPipelineStageFlags waitDstStageMask1 = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
      VkSubmitInfo submitInfo1 = {
      ...
      .waitSemaphoreCount = 1,
      .pWaitSemaphores = &acquireSemaphore,
      .pWaitDstStageMask = &waitDstStageMask1,
      .commandBufferCount = 1,
      .pCommandBuffers = &renderingCommandBuffer,
      .signalSemaphoreCount = 1,
      .pSignalSemaphores = &graphicsSemaphore};

      vkQueueSubmit(renderQueue, &submitInfo1, ...);

      VkPipelineStageFlags waitDstStageMask2 = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
      VkSubmitInfo submitInfo2 = {
      ...
      .waitSemaphoreCount = 1,
      .pWaitSemaphores = &graphicsSemaphore,
      .pWaitDstStageMask = &waitDstStageMask2,
      .commandBufferCount = 1,
      .pCommandBuffers = &prePresentCommandBuffer,
      .signalSemaphoreCount = 1,
      .pSignalSemaphores = &ownershipPresentSemaphore};

      vkQueueSubmit(presentQueue, &submitInfo2, ...);

      VkPresentInfoKHR presentInfo = {
      .waitSemaphoreCount = 1,
      .pWaitSemaphores = &ownershipPresentSemaphore,
      ...};

      vkQueuePresentKHR(..., &presentInfo);

References

https://github.com/KhronosGroup/Vulkan-Docs/wiki/Synchronization-Examples

显示 Gitment 评论