diff --git a/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu b/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu new file mode 100644 index 0000000..3fa034c --- /dev/null +++ b/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu @@ -0,0 +1,165 @@ +/* +Point cloud feature pooling +Written by Shaoshuai Shi +All Rights Reserved 2018. +*/ + +#include +#include + +#define THREADS_PER_BLOCK 256 +#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) +// #define DEBUG + + +__device__ inline void lidar_to_local_coords(float shift_x, float shift_y, float rot_angle, float &local_x, float &local_y){ + float cosa = cos(-rot_angle), sina = sin(-rot_angle); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + + +__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, float &local_x, float &local_y){ + // param pt: (x, y, z) + // param box3d: [x, y, z, dx, dy, dz, heading] (x, y, z) is the box center + + const float MARGIN = 1e-5; + float x = pt[0], y = pt[1], z = pt[2]; + float cx = box3d[0], cy = box3d[1], cz = box3d[2]; + float dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6]; + + if (fabsf(z - cz) > dz / 2.0) return 0; + lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); + float in_flag = (fabs(local_x) < dx / 2.0 + MARGIN) & (fabs(local_y) < dy / 2.0 + MARGIN); + return in_flag; +} + + +__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, const float *xyz, const float *boxes3d, int *pts_assign){ + // params xyz: (B, N, 3) + // params boxes3d: (B, M, 7) + // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means background points + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + int box_idx = blockIdx.y; + int bs_idx = blockIdx.z; + + if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size){ + return; + } + int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; + pts_assign[assign_idx] = 0; + + int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; + int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; + + + float local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, local_x, local_y); + pts_assign[assign_idx] = cur_in_flag; + // printf("bs=%d, pt=%d, in=%d\n", bs_idx, pt_idx, pts_assign[bs_idx * pts_num + pt_idx]); +} + + +__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, int sampled_pts_num, + const int *pts_assign, int *pts_idx, int *pooled_empty_flag){ + // params xyz: (B, N, 3) + // params pts_feature: (B, N, C) + // params pts_assign: (B, N) + // params pts_idx: (B, M, 512) + // params pooled_empty_flag: (B, M) + + int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (boxes_idx >= boxes_num){ + return; + } + + int bs_idx = blockIdx.y; + + int cnt = 0; + for (int k = 0; k < pts_num; k++){ + if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]){ + if (cnt < sampled_pts_num){ + pts_idx[bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num + cnt] = k; + cnt++; + } + else break; + } + } + + if (cnt == 0){ + pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; + } + else if (cnt < sampled_pts_num){ + // duplicate same points for sampling + for (int k = cnt; k < sampled_pts_num; k++){ + int duplicate_idx = k % cnt; + int base_offset = bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; + pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; + } + } +} + + +__global__ void roipool3d_forward(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num, + const float *xyz, const int *pts_idx, const float *pts_feature, + float *pooled_features, int *pooled_empty_flag){ + // params xyz: (B, N, 3) + // params pts_idx: (B, M, 512) + // params pts_feature: (B, N, C) + // params pooled_features: (B, M, 512, 3+C) + // params pooled_empty_flag: (B, M) + + int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + int box_idx = blockIdx.y; + int bs_idx = blockIdx.z; + + if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || bs_idx >= batch_size){ + return; + } + + if (pooled_empty_flag[bs_idx * boxes_num + box_idx]){ + return; + } + + int temp_idx = bs_idx * boxes_num * sampled_pts_num + box_idx * sampled_pts_num + sample_pt_idx; + int src_pt_idx = pts_idx[temp_idx]; + int dst_feature_offset = temp_idx * (3 + feature_in_len); + + for (int j = 0; j < 3; j++) + pooled_features[dst_feature_offset + j] = xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; + + int src_feature_offset = bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; + for (int j = 0; j < feature_in_len; j++) + pooled_features[dst_feature_offset + 3 + j] = pts_feature[src_feature_offset + j]; +} + + +void roipool3dLauncher(int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num, + const float *xyz, const float *boxes3d, const float *pts_feature, float *pooled_features, int *pooled_empty_flag){ + + // printf("batch_size=%d, pts_num=%d, boxes_num=%d\n", batch_size, pts_num, boxes_num); + int *pts_assign = NULL; + cudaMalloc(&pts_assign, batch_size * pts_num * boxes_num * sizeof(int)); // (batch_size, N, M) + // cudaMemset(&pts_assign, -1, batch_size * pts_num * boxes_num * sizeof(int)); + + dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK); + assign_pts_to_box3d<<>>(batch_size, pts_num, boxes_num, xyz, boxes3d, pts_assign); + + int *pts_idx = NULL; + cudaMalloc(&pts_idx, batch_size * boxes_num * sampled_pts_num * sizeof(int)); // (batch_size, M, sampled_pts_num) + + dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size); // blockIdx.x(col), blockIdx.y(row) + get_pooled_idx<<>>(batch_size, pts_num, boxes_num, sampled_pts_num, pts_assign, pts_idx, pooled_empty_flag); + + dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num, batch_size); + roipool3d_forward<<>>(batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, + xyz, pts_idx, pts_feature, pooled_features, pooled_empty_flag); + + cudaFree(pts_assign); + cudaFree(pts_idx); + +#ifdef DEBUG + cudaDeviceSynchronize(); // for using printf in kernel function +#endif +} \ No newline at end of file