DeviceQueue

DeviceQueue is similar to the CommandQueue in DX12: it provides the interface for invoking algorithms (kernels) on the Device, and it is the bridge through which the Host interacts with the Device.

DeviceQueue belongs to Cycles' Device module, which is an abstraction over the different rendering devices; CUDADeviceQueue is one concrete kind of DeviceQueue.

The DeviceQueue class is abstract; each kind of Device supplies its own concrete implementation.

DeviceKernelArguments

DeviceKernelArguments is a container that keeps device kernel arguments together with their correct types. It is passed to the enqueue function: since the DeviceQueue of each Device expects a different number and set of argument types, DeviceKernelArguments gives them a uniform representation.

Implementation:

struct DeviceKernelArguments {
  enum Type {
    POINTER,
    INT32,
    FLOAT32,
    BOOLEAN,
    KERNEL_FILM_CONVERT,
  };
  // At most 18 arguments.
  static const int MAX_ARGS = 18;
  // Stores each argument's type, value pointer and memory size.
  Type types[MAX_ARGS];
  void *values[MAX_ARGS];
  size_t sizes[MAX_ARGS];
  // Number of arguments added so far.
  size_t count = 0;

  // Variadic template constructors that accept arguments of different
  // types and counts.
  template<class T> DeviceKernelArguments(const T *arg)
  {
    add(arg);
  }

  template<class T, class... Args> DeviceKernelArguments(const T *first, Args... args)
  {
    add(first);
    add(args...);
  }

  // add() is variadic as well; each argument is dispatched to the add()
  // overload matching its type.
  template<typename T, typename... Args> void add(const T *first, Args... args)
  {
    add(first);
    add(args...);
  }
  void add(const float *value)
  {
    add(FLOAT32, value, sizeof(float));
  }
  void add(const bool *value)
  {
    // Booleans are passed as 4 bytes.
    add(BOOLEAN, value, 4);
  }
  ...
  // The add() overload that actually stores an argument.
  void add(const Type type, const void *value, size_t size)
  {
    assert(count < MAX_ARGS);

    types[count] = type;
    values[count] = (void *)value;
    sizes[count] = size;
    count++;
  }
};
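
As a quick illustration, a hypothetical call site could build the container like this; each pointer is routed to the matching add() overload, which records its Type, value pointer and size. The variable names are made up, and the int and device_ptr overloads are among those elided by the `...` above.

/* Hypothetical usage sketch of DeviceKernelArguments. */
int num_samples = 128;
float exposure = 1.0f;
device_ptr d_buffer = buffer.device_pointer; /* a device_memory's device pointer */

DeviceKernelArguments args(&d_buffer, &num_samples, &exposure);
/* Now count == 3 and types holds {POINTER, INT32, FLOAT32}. */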

Main functions

  /* Initialize execution of kernels on this queue.
   *
   * Will, for example, load all data required by the kernels from Device to global or path state.
   *
   * Use this method after device synchronization has finished before enqueueing any kernels. */
  virtual void init_execution() = 0;

  /* Enqueue kernel execution.
   *
   * Execute the kernel work_size times on the device.
   * Supported arguments types:
   * - int: pass pointer to the int
   * - device memory: pass pointer to device_memory.device_pointer
   * Return false if there was an error executing this or a previous kernel. */
  virtual bool enqueue(DeviceKernel kernel,
                       const int work_size,
                       DeviceKernelArguments const &args) = 0;

  /* Wait until all enqueued kernels have finished execution.
   * Return false if there was an error executing any of the enqueued kernels. */
  virtual bool synchronize() = 0;

  /* Copy memory to/from device as part of the command queue, to ensure
   * operations are done in order without having to synchronize. */
  virtual void zero_to_device(device_memory &mem) = 0;
  virtual void copy_to_device(device_memory &mem) = 0;
  virtual void copy_from_device(device_memory &mem) = 0;

These are the most commonly used functions of DeviceQueue, and the comments describe what each one does: init_execution prepares the execution environment, enqueue adds a rendering kernel to the queue, synchronize synchronizes the Host with the Device, and zero_to_device, copy_to_device and copy_from_device manage memory. All of them are pure virtual functions; each device is responsible for its own implementation.
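
Putting these together, the typical call pattern is: initialize, upload, enqueue, wait, read back. Below is a minimal sketch; the kernel choice, work size and buffer names are hypothetical, and most error handling is omitted.

/* Hypothetical usage sketch of a DeviceQueue. */
queue->init_execution();

queue->zero_to_device(render_buffer); /* Clear output memory on the device. */
queue->copy_to_device(work_tiles);    /* Upload inputs, ordered with the queue. */

DeviceKernelArguments args(&d_work_tiles, &work_size);
queue->enqueue(DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA, work_size, args);

if (!queue->synchronize()) { /* Wait for all enqueued kernels. */
  /* A kernel failed; handle the error. */
}
queue->copy_from_device(render_buffer); /* Read results back to the host. */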

The CUDADeviceQueue implementation

Taking CUDADeviceQueue as an example, let's look at how the main DeviceQueue functions are actually implemented.

void CUDADeviceQueue::init_execution()
{
  /* Synchronize all textures and memory copies before executing task. */
  CUDAContextScope scope(cuda_device_);
  cuda_device_->load_texture_info();
  cuda_device_assert(cuda_device_, cuCtxSynchronize());

  debug_init_execution();
}

init_execution enters a CUDAContextScope, loads the texture info onto the device, and then calls cuCtxSynchronize so that all texture and memory copies have completed before any kernel runs.
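
CUDAContextScope itself is a small RAII guard. Roughly, as a paraphrase based on the CUDA driver API rather than a copy of the real class, it makes the device's CUDA context current on the calling thread and restores the previous one when the scope ends:

/* Approximate shape of CUDAContextScope; the member name is an assumption. */
class CUDAContextScope {
 public:
  explicit CUDAContextScope(CUDADevice *device)
  {
    /* Make this device's context current for the calling thread. */
    cuCtxPushCurrent(device->cuContext);
  }
  ~CUDAContextScope()
  {
    /* Pop it again so other threads and devices are unaffected. */
    CUcontext discarded;
    cuCtxPopCurrent(&discarded);
  }
};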

bool CUDADeviceQueue::enqueue(DeviceKernel kernel,
                              const int work_size,
                              DeviceKernelArguments const &args)
{
  if (cuda_device_->have_error()) {
    return false;
  }

  debug_enqueue_begin(kernel, work_size);

  const CUDAContextScope scope(cuda_device_);
  const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);

  /* Compute kernel launch parameters. */
  const int num_threads_per_block = cuda_kernel.num_threads_per_block;
  const int num_blocks = divide_up(work_size, num_threads_per_block);

  int shared_mem_bytes = 0;

  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
      /* See parallel_active_index.h for why this amount of shared memory is needed. */
      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
      break;

    default:
      break;
  }

  /* Launch kernel. */
  assert_success(cuLaunchKernel(cuda_kernel.function,
                                num_blocks,
                                1,
                                1,
                                num_threads_per_block,
                                1,
                                1,
                                shared_mem_bytes,
                                cuda_stream_,
                                const_cast<void **>(args.values),
                                0),
                 "enqueue");

  debug_enqueue_end();

  return !(cuda_device_->have_error());
}

enqueue launches the given kernel on the device via CUDA, executing it work_size times. A CUDA launch also needs a grid configuration, namely the number of blocks and the number of threads per block: num_blocks is work_size divided by num_threads_per_block, rounded up. For the *_PATHS_ARRAY kernels, shared_mem_bytes additionally reserves (num_threads_per_block + 1) ints of shared memory, which the block-wide scan in parallel_active_index.h uses to compact active path indices into arrays. Finally, the pointers collected in DeviceKernelArguments are handed to cuLaunchKernel to supply whatever parameters the particular kernel expects.
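divide_up is ordinary ceiling division (paraphrased here from Cycles' math utilities), which guarantees the grid covers all work_size items:

/* Ceiling division: with work_size = 1000 and num_threads_per_block = 256,
 * num_blocks = (1000 + 255) / 256 = 4, so 1024 threads are launched and the
 * kernel must ignore the 24 excess thread indices itself. */
static inline int divide_up(int x, int y)
{
  return (x + y - 1) / y;
}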

bool CUDADeviceQueue::synchronize()
{
  if (cuda_device_->have_error()) {
    return false;
  }

  const CUDAContextScope scope(cuda_device_);
  assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");

  debug_synchronize();

  return !(cuda_device_->have_error());
}

synchronize wraps the CUDA driver function cuStreamSynchronize, which blocks until every kernel previously submitted to cuda_stream_ has finished.

void CUDADeviceQueue::copy_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    cuda_device_->mem_alloc(mem);
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory to device. */
  const CUDAContextScope scope(cuda_device_);
  assert_success(
      cuMemcpyHtoDAsync(
          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
      "copy_to_device");
}

The three memory-management functions are similar, with copy_to_device shown as the example. Each takes a device_memory, allocates device storage on demand, and maps the operation to the matching CUDA driver call: copy_to_device uses cuMemcpyHtoDAsync, while copy_from_device uses cuMemcpyDtoHAsync. Because the copies are issued asynchronously on the queue's stream, they execute in order with the enqueued kernels without any extra synchronization.
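
For comparison, here is a sketch of copy_from_device written by mirroring the pattern above (paraphrased, not copied from the source; zero_to_device would be analogous, but would clear the memory with a cuMemset-style driver call instead of copying):

void CUDADeviceQueue::copy_from_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Async device-to-host copy on the same stream keeps queue order. */
  const CUDAContextScope scope(cuda_device_);
  assert_success(
      cuMemcpyDtoHAsync(
          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
      "copy_from_device");
}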