在GPU编程中,很多状态信息需要在不同的内核执行之间进行传递和保留,以便实现高效的并行计算和数据共享。IntegratorState意为积分器的状态,它的作用就是在不同Kernel之间共享信息。IntegratorState不能占用太多内存,否则会影响执行效率;它的内存管理和分配方式由不同的硬件来决定。

IntegratorShadowStateCPU

/* CPU-side shadow path state.
 *
 * The members are generated by including `shadow_state_template.h` while the
 * KERNEL_STRUCT_* macros are defined so that every template entry becomes a
 * plain `type name;` field and array structs are sized with `cpu_size`. */
typedef struct IntegratorShadowStateCPU {
#define KERNEL_STRUCT_BEGIN(name) struct {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
#define KERNEL_STRUCT_END(name) \
  } \
  name;
#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
  } \
  name[cpu_size];
#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
#include "kernel/integrator/shadow_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
/* Undefine this helper too, so that none of the template macros defined for
 * this struct remain visible after it. */
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
} IntegratorShadowStateCPU;
/* CPU-side main path state.
 *
 * The members are generated by including `state_template.h` while the
 * KERNEL_STRUCT_* macros expand every template entry to a plain `type name;`
 * field; array structs use the `cpu_size` argument as their length. */
typedef struct IntegratorStateCPU {
#define KERNEL_STRUCT_BEGIN(name) struct {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
#define KERNEL_STRUCT_END(name) \
  } \
  name;
#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
  } \
  name[cpu_size];
#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
#include "kernel/integrator/state_template.h"
/* Remove the helper macros again so they can be redefined for another struct. */
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
  /* Two shadow states embedded by value.
   * NOTE(review): presumably one for shadow rays and one for ambient
   * occlusion, judging by the names only -- confirm against the callers. */
  IntegratorShadowStateCPU shadow;
  IntegratorShadowStateCPU ao;
} IntegratorStateCPU;

用于CPU渲染时的IntegratorState,这里不详细研究。

IntegratorStateGPU

/* GPU-side integrator state.
 *
 * Both `state_template.h` and `shadow_state_template.h` are included while
 * KERNEL_STRUCT_MEMBER expands to `ccl_global type *name;`, so every template
 * entry becomes a pointer instead of a value. Array structs use the
 * `gpu_size` argument as their length. */
typedef struct IntegratorStateGPU {
#define KERNEL_STRUCT_BEGIN(name) struct {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) ccl_global type *name;
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
#define KERNEL_STRUCT_END(name) \
  } \
  name;
#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
  } \
  name[gpu_size];
#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
#include "kernel/integrator/state_template.h"
#include "kernel/integrator/shadow_state_template.h"
/* Remove the helper macros again so they do not leak past this struct. */
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
  /* Count number of queued kernels. */
  ccl_global IntegratorQueueCounter *queue_counter;
  /* Count number of kernels queued for specific shaders. */
  ccl_global int *sort_key_counter[DEVICE_KERNEL_INTEGRATOR_NUM];
  /* Index of shadow path which will be used by a next shadow path.  */
  ccl_global int *next_shadow_path_index;
  /* Index of main path which will be used by a next shadow catcher split.  */
  ccl_global int *next_main_path_index;
  /* Partition/key offsets used when writing sorted active indices. */
  ccl_global int *sort_partition_key_offsets;
  /* Divisor used to partition active indices by locality when sorting by material.  */
  uint sort_partition_divisor;
} IntegratorStateGPU;

KERNEL_STRUCT宏定义

在IntegratorState中出现了KERNEL_STRUCT_BEGIN这样的宏定义,而后又会将其undef。它是一个辅助宏,用来定义IntegratorState中的各种属性。

IntegratorState中这种定义数据的格式在项目中经常使用,首先定义宏:

/* Helper macros: BEGIN opens an anonymous struct, MEMBER declares one
 * `ccl_global type *name;` pointer field (the `parent_struct` and `feature`
 * arguments are not used in this expansion), END closes the struct and names
 * the resulting member. */
#define KERNEL_STRUCT_BEGIN(name) struct {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) ccl_global type *name;
#define KERNEL_STRUCT_END(name) \
  } \
  name;

然后就可以用这个宏做结构体的定义:

/* Declares a nested struct member named `path` with a single `sample` field. */
KERNEL_STRUCT_BEGIN(path)
KERNEL_STRUCT_MEMBER(path, uint32_t, sample, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_END(path)

上面的代码就相当于:

/* Result of expanding the three macro invocations; KERNEL_STRUCT_END
 * supplies the trailing `;` after the member name. */
struct {
    ccl_global uint32_t *sample;
} path;

这样在需要定义大量的结构体时,可以单独把它们放到另一个文件中,再引用过来。

在parent_struct, type, name, feature这几个宏参数中,parent_struct和feature在上面的宏定义里都没有被展开使用;其中feature看起来只起到一个标识作用。

以IntegratorStateGPU为例,它include了两个文件:state_template.h和shadow_state_template.h。state_template.h中定义了Path、Ray、Intersection、Subsurface等结构体类型,shadow_state_template.h中定义了shadow_path、shadow_ray等结构体类型。

state_template.h文件的一些定义:

KERNEL_STRUCT_BEGIN(path)
/* Index of a pixel within the device render buffer where this path will write its result.
 * To get an actual offset within the buffer the value needs to be multiplied by the
 * `kernel_data.film.pass_stride`.
 *
 * The multiplication is delayed for later, so that state can use 32bit integer. */
KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING)
/* Current sample number. */
KERNEL_STRUCT_MEMBER(path, uint32_t, sample, KERNEL_FEATURE_PATH_TRACING)
/* Current ray bounce depth. */
KERNEL_STRUCT_MEMBER(path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current transparent ray bounce depth. */
KERNEL_STRUCT_MEMBER(path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current diffuse ray bounce depth. */
KERNEL_STRUCT_MEMBER(path, uint16_t, diffuse_bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current glossy ray bounce depth. */
KERNEL_STRUCT_MEMBER(path, uint16_t, glossy_bounce, KERNEL_FEATURE_PATH_TRACING)
...
KERNEL_STRUCT_END(path)

这里就是定义了一个path结构体,它的成员有render_pixel_index,sample,bounce等。

INTEGRATOR_STATE宏定义

/* Read access: select `nested_struct.member` inside `kernel_integrator_state`
 * and index it with `state`. */
#  define INTEGRATOR_STATE(state, nested_struct, member) \
    kernel_integrator_state.nested_struct.member[state]
/* Write access: expands to exactly the same expression as INTEGRATOR_STATE. */
#  define INTEGRATOR_STATE_WRITE(state, nested_struct, member) \
    INTEGRATOR_STATE(state, nested_struct, member)

/* Same as above for array structs: `array_index` picks the array element
 * first, then `state` indexes the member. */
#  define INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) \
    kernel_integrator_state.nested_struct[array_index].member[state]
#  define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \
    INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member)

这些宏定义是用来快速地读取或写入state中的属性的,比如想读取一个state的path中的sample属性:

sample = INTEGRATOR_STATE(state, path, sample)
// 等价于 sample = kernel_integrator_state.path.sample[state]

同理,写入此属性:

INTEGRATOR_STATE_WRITE(state, path, sample) = sample
// 等价于 kernel_integrator_state.path.sample[state] = sample

如果是数组成员的话加上一个array_index参数。

需要注意的是,IntegratorStateGPU和IntegratorStateCPU关于INTEGRATOR_STATE宏的定义是不一样的,IntegratorStateCPU中是这样定义的:

/* Read and write access both dereference `state` as a pointer and select
 * `nested_struct.member` directly; no indexing by state is involved. */
#  define INTEGRATOR_STATE(state, nested_struct, member) ((state)->nested_struct.member)
#  define INTEGRATOR_STATE_WRITE(state, nested_struct, member) ((state)->nested_struct.member)

/* Array structs: `array_index` selects the element of `nested_struct`. */
#  define INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member) \
    ((state)->nested_struct[array_index].member)
#  define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \
    ((state)->nested_struct[array_index].member)

它的state是一个指向结构体的指针(宏中使用了->),以上面读取path中的sample属性为例:

sample = INTEGRATOR_STATE(state, path, sample)
// 等价于 sample = ((state)->path.sample)

在GPU中,state是一个索引,从kernel_integrator_state中根据此索引取数据;在CPU中,state是指向一个完整结构体的指针,可以直接从该结构体内部取数据。

在CPU中,IntegratorState的定义是这样的:

/* State handles are `ccl_restrict`-qualified pointers to the CPU state
 * structs; the Const variants point to const-qualified structs. */
typedef IntegratorStateCPU *ccl_restrict IntegratorState;
typedef const IntegratorStateCPU *ccl_restrict ConstIntegratorState;
typedef IntegratorShadowStateCPU *ccl_restrict IntegratorShadowState;
typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;

直接将指向IntegratorStateCPU的指针作为IntegratorState。

在GPU中,IntegratorState的定义是这样的:

/* All four state handle types are plain `int` here, including the Const
 * variants, which carry no const qualification. */
typedef int IntegratorState;
typedef int ConstIntegratorState;
typedef int IntegratorShadowState;
typedef int ConstIntegratorShadowState;

IntegratorState就是一个int值。在CUDA中,这个IntegratorState是线程的索引,每个线程计算光线追踪的一条path。

kernel_integrator_state

通过上面对于IntegratorState的分析可以知道,GPU上的IntegratorState数据是通过索引从kernel_integrator_state获取的。这里以CUDA为例,了解kernel_integrator_state是如何定义的,包含了哪些内容。

在 src\kernel\device\cuda\globals.h 文件中,可以看到如下定义:

/* Aggregate of everything the kernels access through `kernel_params`:
 * the scene data, one const pointer per entry of `kernel/data_arrays.h`,
 * and the GPU integrator state. */
struct KernelParamsCUDA {
  /* Global scene data and textures */
  KernelData data;
/* Each KERNEL_DATA_ARRAY entry of the included header becomes a `const type *` field. */
#define KERNEL_DATA_ARRAY(type, name) const type *name;
#include "kernel/data_arrays.h"

  /* Integrator state */
  IntegratorStateGPU integrator_state;
};

/* The single instance is declared with `__constant__`, and only when
 * `__KERNEL_GPU__` is defined. */
#ifdef __KERNEL_GPU__
__constant__ KernelParamsCUDA kernel_params;
#endif

/* Abstraction macros */
/* These map the generic kernel_* names onto fields of `kernel_params`. */
#define kernel_data kernel_params.data
#define kernel_data_fetch(name, index) kernel_params.name[(index)]
#define kernel_data_array(name) (kernel_params.name)
#define kernel_integrator_state kernel_params.integrator_state

从这里可以看到kernel_integrator_state是一个IntegratorStateGPU类型的数据,它保存在KernelParamsCUDA结构体中。kernel_params是KernelParamsCUDA结构体的实例,它在初始化完成后将数据copy到GPU。

辅助函数

在src\kernel\integrator\state_util.h文件中定义了一些辅助函数,它利用INTEGRATOR_STATE的宏定义,将IntegratorState中对一个结构体的读写操作封装到一起,比如下面就是对IntegratorState的Ray数据的读写操作:

/* Ray */

/* Copy the P, D, dP, dD, tmin, tmax and time fields of `ray` into the `ray`
 * section of the integrator state addressed by `state`. */
ccl_device_forceinline void integrator_state_write_ray(IntegratorState state,
                                                       ccl_private const Ray *ccl_restrict ray)
{
  /* P and D together with their companion dP / dD fields. */
  INTEGRATOR_STATE_WRITE(state, ray, P) = ray->P;
  INTEGRATOR_STATE_WRITE(state, ray, dP) = ray->dP;
  INTEGRATOR_STATE_WRITE(state, ray, D) = ray->D;
  INTEGRATOR_STATE_WRITE(state, ray, dD) = ray->dD;

  /* Scalar fields. */
  INTEGRATOR_STATE_WRITE(state, ray, tmin) = ray->tmin;
  INTEGRATOR_STATE_WRITE(state, ray, tmax) = ray->tmax;
  INTEGRATOR_STATE_WRITE(state, ray, time) = ray->time;
}

/* Fill the P, D, dP, dD, tmin, tmax and time fields of `ray` from the `ray`
 * section of the integrator state addressed by `state`. */
ccl_device_forceinline void integrator_state_read_ray(ConstIntegratorState state,
                                                      ccl_private Ray *ccl_restrict ray)
{
  /* P and D together with their companion dP / dD fields. */
  ray->P = INTEGRATOR_STATE(state, ray, P);
  ray->dP = INTEGRATOR_STATE(state, ray, dP);
  ray->D = INTEGRATOR_STATE(state, ray, D);
  ray->dD = INTEGRATOR_STATE(state, ray, dD);

  /* Scalar fields. */
  ray->tmin = INTEGRATOR_STATE(state, ray, tmin);
  ray->tmax = INTEGRATOR_STATE(state, ray, tmax);
  ray->time = INTEGRATOR_STATE(state, ray, time);
}