为什么使用Compute Shader计算高斯模糊

在计算高斯模糊时,由于卷积运算的关系,相邻像素会重复采样大量相同的纹素,因此直接采样贴图的效率是很低的。而GPU中有一种叫做共享内存(Group Shared Memory)的存储,线程访问共享内存的效率远远高于采样贴图的效率,因此本文将介绍如何在Compute Shader中使用Group Shared Memory加速高斯模糊

实现步骤

  • 实现高斯模糊的基本步骤这里就不细说了,重点介绍如何加速,及加速需注意的细节

  • 虽然Group Shared Memory会存储贴图信息,但它的大小是有限的,因此这里需要考虑最大的模糊半径(因为模糊半径越大,需要的Group Shared Memory也越大),这里设最大模糊半径MAX_RADIUS为32

  • 在Compute Shader中采样时,可以把每个线程理解为screen space uv,那么在线程组的边界处,由于模糊半径_BlurRadius的影响,线程可能会采样到线程组范围之外的像素。因此Group Shared Memory开辟的大小应是线程数 + 2 * _BlurRadius(实现中数组大小必须是编译期常量,所以按最大模糊半径MAX_RADIUS开辟)

    在实现时还需要注意处理越界问题(采样存储线程组外的贴图信息),越界后采样情况如下:

  • 为了优化高斯模糊,常见的做法是将二维高斯模糊拆分成两个一维的,水平和竖直方向各模糊一次;本文在此基础上还通过线性插值的手段进一步优化,并可配合先降低图片分辨率再模糊、最后还原分辨率的方法

实现

#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"

#pragma region Kernel

// Compute kernel entry points: one pass per blur direction.
// Each directive names a function defined further down in this file.
#pragma kernel GaussianBlurHorizontalMain
#pragma kernel GaussianBlurVerticalMain

#pragma endregion

#pragma region Declaration

// Blur radius; the kernels place their 17 taps at (i - 8) * _BlurRadius / 8 texels
// from the centre, so the outermost taps sit _BlurRadius texels away.
float _BlurRadius;
// xy is used by the kernels as the texture size in texels when clamping load positions.
// NOTE(review): zw is not read by the code in this file -- presumably 1/size; confirm with the caller.
float4 _ViewSize;

// Source image read by the kernels via Load().
Texture2D<float4> _InputTex;
// Destination image written by the kernels.
RWTexture2D<float4> _OutputTex;

// 17 Gaussian weights, symmetric around the centre tap at index 8.
// The values sum to approximately 1.
static const float Gaussian17[] =
{
    0.00002611081194810,
    0.00021522769030413,
    0.00133919168719865,
    0.00628987509902766,
    0.02229954363469697,
    0.05967667338326389,
    0.12055019394312867,
    0.18381709484250766,
    0.21157217927735517,
    0.18381709484250766,
    0.12055019394312867,
    0.05967667338326389,
    0.02229954363469697,
    0.00628987509902766,
    0.00133919168719865,
    0.00021522769030413,
    0.00002611081194810,
};

// Group shared memory is limited in size, so the blur radius is capped; the maximum is 32.
#define MAX_RADIUS 32

// The blur samples texels on both sides of the target texel, so the cache must hold not only
// one texel per thread (64) but also MAX_RADIUS extra texels on each side.
groupshared float3 GS_Color[64 + 2 * MAX_RADIUS];

#pragma endregion

// Horizontal pass of the separable Gaussian blur.
//
// Each group of 64 threads handles 64 consecutive texels of one row. The group first caches
// those texels plus MAX_RADIUS extra texels on either side in GS_Color, then every thread
// accumulates its 17 weighted taps from that cache instead of re-reading _InputTex.
//
// groupID          - unused.
// groupIndex       - flattened thread index inside the group (0..63), i.e. the x offset in the group.
// dispatchThreadID - xy is the texel this thread loads and writes.
[numthreads(64,1,1)]
void GaussianBlurHorizontalMain (uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex, uint3 dispatchThreadID : SV_DispatchThreadID)
{
    // Index of the last valid element of GS_Color.
    const int lastSlot = 64 + 2 * MAX_RADIUS - 1;

    const int2 currPos = dispatchThreadID.xy;

    // Centre region: copy this thread's own texel from the texture into shared memory.
    // The position is clamped so threads past the image edge replicate the border texel.
    const int2 centerPos = clamp(currPos, 0, _ViewSize.xy - 1);
    GS_Color[groupIndex + MAX_RADIUS] = _InputTex.Load(uint3(centerPos, 0)).rgb;

    // Left apron: the first MAX_RADIUS threads also fetch the texel MAX_RADIUS to their left.
    // The shift is applied to the unclamped position and clamped afterwards; shifting the
    // already-clamped position would cache the wrong texel for threads past the right image edge.
    if(groupIndex < MAX_RADIUS)
    {
        const int2 leftPos = clamp(currPos - int2(MAX_RADIUS, 0), 0, _ViewSize.xy - 1);
        GS_Color[groupIndex] = _InputTex.Load(uint3(leftPos, 0)).rgb;
    }

    // Right apron: the last MAX_RADIUS threads also fetch the texel MAX_RADIUS to their right.
    if(groupIndex >= 64 - MAX_RADIUS)
    {
        const int2 rightPos = clamp(currPos + int2(MAX_RADIUS, 0), 0, _ViewSize.xy - 1);
        GS_Color[groupIndex + 2 * MAX_RADIUS] = _InputTex.Load(uint3(rightPos, 0)).rgb;
    }

    // Every thread must finish filling the cache before any thread reads from it.
    GroupMemoryBarrierWithGroupSync();

    // The cache only extends MAX_RADIUS texels beyond the group, so larger radii are capped.
    // The weights are symmetric, so a negative radius yields the same taps as its magnitude.
    const float radius = min(abs(_BlurRadius), (float)MAX_RADIUS);
    const int centerSlot = (int)groupIndex + MAX_RADIUS;

    float3 blurColor = 0.f;
    UNITY_UNROLL
    for(uint i = 0; i < 17; ++i)
    {
        const float weight = Gaussian17[i];

        // Taps are spread evenly over [-radius, +radius]; fractional positions are resolved
        // by linearly interpolating between the two neighbouring cached texels.
        const float offset = ((float)i - 8) * radius * 0.125;
        const int floorOffset = (int)floor(offset);
        const float lerpValue = offset - floorOffset;

        const int floorSlot = centerSlot + floorOffset;
        // With radius == MAX_RADIUS the outermost tap of the last thread lands on the final
        // slot; its upper neighbour would be out of range, and lerpValue is 0 there, so
        // clamping the index does not change the result.
        const int ceilSlot = min(floorSlot + 1, lastSlot);

        const float3 sampleColorFloor = GS_Color[floorSlot];
        const float3 sampleColorCeil  = GS_Color[ceilSlot];
        blurColor += lerp(sampleColorFloor, sampleColorCeil, lerpValue) * weight;
    }

    _OutputTex[dispatchThreadID.xy] = float4(blurColor, 1.f);
}

// Vertical pass of the separable Gaussian blur.
//
// Each group of 64 threads handles 64 consecutive texels of one column. The group first caches
// those texels plus MAX_RADIUS extra texels above and below in GS_Color, then every thread
// accumulates its 17 weighted taps from that cache instead of re-reading _InputTex.
//
// groupID          - unused.
// groupIndex       - flattened thread index inside the group (0..63), i.e. the y offset in the group.
// dispatchThreadID - xy is the texel this thread loads and writes.
[numthreads(1,64,1)]
void GaussianBlurVerticalMain (uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex, uint3 dispatchThreadID : SV_DispatchThreadID)
{
    // Index of the last valid element of GS_Color.
    const int lastSlot = 64 + 2 * MAX_RADIUS - 1;

    const int2 currPos = dispatchThreadID.xy;

    // Centre region: copy this thread's own texel from the texture into shared memory.
    // The position is clamped so threads past the image edge replicate the border texel.
    const int2 centerPos = clamp(currPos, 0, _ViewSize.xy - 1);
    GS_Color[groupIndex + MAX_RADIUS] = _InputTex.Load(uint3(centerPos, 0)).rgb;

    // Upper apron: the first MAX_RADIUS threads also fetch the texel MAX_RADIUS rows before theirs.
    // The shift uses a signed int2 (an unsigned operand would wrap instead of going negative,
    // defeating the clamp at the top edge) and is applied to the unclamped position, so threads
    // past the bottom image edge still cache the correct row.
    if(groupIndex < MAX_RADIUS)
    {
        const int2 upperPos = clamp(currPos - int2(0, MAX_RADIUS), 0, _ViewSize.xy - 1);
        GS_Color[groupIndex] = _InputTex.Load(uint3(upperPos, 0)).rgb;
    }

    // Lower apron: the last MAX_RADIUS threads also fetch the texel MAX_RADIUS rows after theirs.
    if(groupIndex >= 64 - MAX_RADIUS)
    {
        const int2 lowerPos = clamp(currPos + int2(0, MAX_RADIUS), 0, _ViewSize.xy - 1);
        GS_Color[groupIndex + 2 * MAX_RADIUS] = _InputTex.Load(uint3(lowerPos, 0)).rgb;
    }

    // Every thread must finish filling the cache before any thread reads from it.
    GroupMemoryBarrierWithGroupSync();

    // The cache only extends MAX_RADIUS texels beyond the group, so larger radii are capped.
    // The weights are symmetric, so a negative radius yields the same taps as its magnitude.
    const float radius = min(abs(_BlurRadius), (float)MAX_RADIUS);
    const int centerSlot = (int)groupIndex + MAX_RADIUS;

    float3 blurColor = 0.f;
    UNITY_UNROLL
    for(uint i = 0; i < 17; ++i)
    {
        const float weight = Gaussian17[i];

        // Taps are spread evenly over [-radius, +radius]; fractional positions are resolved
        // by linearly interpolating between the two neighbouring cached texels.
        const float offset = ((float)i - 8) * 0.125f * radius;
        const int offsetFloor = (int)floor(offset);
        const float lerpValue = offset - offsetFloor;

        const int floorSlot = centerSlot + offsetFloor;
        // With radius == MAX_RADIUS the outermost tap of the last thread lands on the final
        // slot; its upper neighbour would be out of range, and lerpValue is 0 there, so
        // clamping the index does not change the result.
        const int ceilSlot = min(floorSlot + 1, lastSlot);

        const float3 sampleColorFloor = GS_Color[floorSlot];
        const float3 sampleColorCeil = GS_Color[ceilSlot];
        blurColor += lerp(sampleColorFloor, sampleColorCeil, lerpValue) * weight;
    }

    _OutputTex[dispatchThreadID.xy] = float4(blurColor, 1.f);
}

后续

目前这个高斯模糊还存在一定问题,它的权重范围是固定的,后续还需考虑如何优化,不过会放置一段时间,先写TAA!

Reference

DirectX11 With Windows SDK--30 计算着色器:高斯模糊、索贝尔算子


他们曾如此骄傲的活过,贯彻始终