//packed_type,acc_type,th_type,element_type,n_elements_per_pack,out_typeTORCH_BINDING_REDUCE(f32, f32,torch::kFloat32, float, 1,float)TORCH_BINDING_REDUCE(f32x4, f32,torch::kFloat32, float, 4,float)TORCH_BINDING_REDUCE(f16, f16,torch::kHalf, half, 1,float)TORCH_BINDING_REDUCE(f16, f32,torch::kHalf, half, 1,float)TORCH_BINDING_REDUCE(f16x2, f16,torch::kHalf, half, 2,float)TORCH_BINDING_REDUCE(f16x2, f32,torch::kHalf, half, 2,float)TORCH_BINDING_REDUCE(f16x8_pack, f16,torch::kHalf, half, 8,float)TORCH_BINDING_REDUCE(f16x8_pack, f32,torch::kHalf, half, 8,float)TORCH_BINDING_REDUCE(bf16, bf16,torch::kBFloat16, __nv_bfloat16, 1,float)TORCH_BINDING_REDUCE(bf16, f32,torch::kBFloat16, __nv_bfloat16, 1,float)TORCH_BINDING_REDUCE(bf16x2, bf16,torch::kBFloat16, __nv_bfloat16, 2,float)TORCH_BINDING_REDUCE(bf16x2, f32,torch::kBFloat16, __nv_bfloat16, 2,float)TORCH_BINDING_REDUCE(bf16x8_pack, bf16,torch::kBFloat16, __nv_bfloat16, 8,float)TORCH_BINDING_REDUCE(bf16x8_pack, f32,torch::kBFloat16, __nv_bfloat16, 8,float)TORCH_BINDING_REDUCE(fp8_e4m3, f16,torch::kFloat8_e4m3fn,__nv_fp8_storage_t,1,float)TORCH_BINDING_REDUCE(fp8_e4m3x16_pack,f16,torch::kFloat8_e4m3fn,__nv_fp8_storage_t,16,float)TORCH_BINDING_REDUCE(fp8_e5m2, f16,torch::kFloat8_e5m2, __nv_fp8_storage_t,1,float)TORCH_BINDING_REDUCE(fp8_e5m2x16_pack,f16,torch::kFloat8_e5m2, __nv_fp8_storage_t,16,float)TORCH_BINDING_REDUCE(i8, i32,torch::kInt8, int8_t, 1,int32_t)TORCH_BINDING_REDUCE(i8x16_pack, i32,torch::kInt8, int8_t, 16,int32_t)