typedef struct
{
    uint m_inputSize;
    uint m_outputSize;
    uint m_parametersBatchSize;
    uint m_parametersStartOffset;
    uint m_inputOutputSize;
    uint m_inputOutputStartOffset;
    uint m_unused[4];
} UniformBufferObject;
)"""";
__kernel void brainCopyInput(__global const UniformBufferObject* parameters, __global float* inputOutputData, __global float* inputBuffer)
{
    // one work-group per batch item; each work item strides through the input in workGroupSize steps
    uint itemId = get_local_id(0);
    uint groupId = get_group_id(0);
    uint workGroupSize = get_local_size(0);

    uint inputSize = parameters->m_inputSize;
    uint inputOutputSize = parameters->m_inputOutputSize;
    uint inputOutputStartOffset = parameters->m_inputOutputStartOffset;
    uint srcBase = groupId * inputSize;
    uint dstBase = groupId * inputOutputSize + inputOutputStartOffset;

    uint workGroupSizeReminder = inputSize % workGroupSize;
    uint modWorkGroupSize = inputSize - workGroupSizeReminder;
    for (uint i = 0; i < modWorkGroupSize; i += workGroupSize)
    {
        float a = inputBuffer[srcBase + i + itemId];
        inputOutputData[dstBase + i + itemId] = a;
    }
    // tail: the remaining inputSize % workGroupSize elements
    if (itemId < workGroupSizeReminder)
    {
        float a = inputBuffer[srcBase + modWorkGroupSize + itemId];
        inputOutputData[dstBase + modWorkGroupSize + itemId] = a;
    }
}
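For context, the kernel copies one batch item per work-group, so the host is expected to launch batchCount work-groups of workGroupSize work items each. Below is a minimal host-side dispatch sketch; the function and variable names (dispatchBrainCopyInput, batchCount, the cl_mem buffers) are assumptions for illustration, not taken from the original post. The PTX that the NVIDIA OpenCL driver generates for this scalar kernel is shown next.

#include <CL/cl.h>

/* Hypothetical host-side dispatch for brainCopyInput: one work-group per batch item.
   All argument names here are assumed, not from the original post. */
static cl_int dispatchBrainCopyInput(
    cl_command_queue queue, cl_kernel kernel,
    cl_mem paramBuf, cl_mem inputOutputBuf, cl_mem inputBuf,
    size_t batchCount, size_t workGroupSize)
{
    size_t localSize = workGroupSize;                /* e.g. 256 work items per group */
    size_t globalSize = batchCount * workGroupSize;  /* groupId selects the batch item */

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &paramBuf);       /* UniformBufferObject */
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputOutputBuf); /* inputOutputData */
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputBuf);       /* inputBuffer */

    return clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
        &globalSize, &localSize, 0, NULL, NULL);
}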
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: UNKNOWN
// Unknown Toolkit Version
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_89, texmode_independent
.address_size 64
// .globl brainCopyInput
// brainLayerLinear_$_cachedInput has been demoted
// brainLayerLinear_$_cachedOutput has been demoted
// brainLayerLinear_$_reductionBuffer has been demoted
// brainLayerSoftmaxActivation_$_tmpInputBuffer has been demoted
// brainLayerSoftmaxActivation_$_reductionBuffer has been demoted
.entry brainCopyInput(
.param .u64 .ptr .global .align 4 brainCopyInput_param_0,
.param .u64 .ptr .global .align 4 brainCopyInput_param_1,
.param .u64 .ptr .global .align 4 brainCopyInput_param_2
)
{
.reg .pred %p<4>;
.reg .f32 %f<3>;
.reg .b32 %r<25>;
.reg .b64 %rd<12>;
ld.param.u64 %rd3, [brainCopyInput_param_0];
ld.param.u64 %rd1, [brainCopyInput_param_1];
ld.param.u64 %rd2, [brainCopyInput_param_2];
mov.u32 %r1, %tid.x;
mov.u32 %r11, %ctaid.x;
mov.b32 %r12, %envreg0;
add.s32 %r13, %r12, %r11;
ld.global.u32 %r14, [%rd3];
mul.lo.s32 %r2, %r14, %r13;
ld.global.u32 %r15, [%rd3+16];
ld.global.u32 %r16, [%rd3+20];
mad.lo.s32 %r3, %r15, %r13, %r16;
mov.u32 %r4, %ntid.x;
rem.u32 %r5, %r14, %r4;
sub.s32 %r6, %r14, %r5;
setp.eq.s32 %p1, %r6, 0;
@%p1 bra $L__BB0_3;
add.s32 %r7, %r2, %r1;
add.s32 %r8, %r3, %r1;
mov.u32 %r24, 0;
$L__BB0_2:
add.s32 %r18, %r7, %r24;
mul.wide.u32 %rd4, %r18, 4;
add.s64 %rd5, %rd2, %rd4;
// this is at least four times slower than in CUDA; the code should issue a 256 or 128 bit copy (st.global.f128)
ld.global.f32 %f1, [%rd5];
add.s32 %r19, %r8, %r24;
mul.wide.u32 %rd6, %r19, 4;
add.s64 %rd7, %rd1, %rd6;
// this is at least four times slower than in CUDA; the code should issue a 256 or 128 bit copy (st.global.f128)
st.global.f32 [%rd7], %f1;
add.s32 %r24, %r24, %r4;
setp.lt.u32 %p2, %r24, %r6;
@%p2 bra $L__BB0_2;
$L__BB0_3:
setp.le.u32 %p3, %r5, %r1;
@%p3 bra $L__BB0_5;
add.s32 %r20, %r2, %r1;
add.s32 %r21, %r20, %r6;
mul.wide.u32 %rd8, %r21, 4;
add.s64 %rd9, %rd2, %rd8;
// this is at least four times slower than in CUDA; the code should issue a 256 or 128 bit copy (st.global.f128)
ld.global.f32 %f2, [%rd9];
add.s32 %r22, %r3, %r1;
add.s32 %r23, %r22, %r6;
mul.wide.u32 %rd10, %r23, 4;
add.s64 %rd11, %rd1, %rd10;
// this is at least four times slower than in CUDA; the code should issue a 256 or 128 bit copy (st.global.f128)
st.global.f32 [%rd11], %f2;
$L__BB0_5:
ret;
}
float8* ptr0 = (float8*)&inputBuffer[srcBase];
float8* ptr1 = (float8*)&inputOutputData[dstBase];
for (uint i = 0; i < modWorkGroupSize; i += workGroupSize)
{
    //float a = inputBuffer[srcBase + i + itemId];
    //inputOutputData[dstBase + i + itemId] = a;
    float8 a = ptr0[i + itemId];
    ptr1[i + itemId] = a;
}
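For reference, a complete vectorized variant of the kernel built around that float8 loop might look like the sketch below. This is an assumption about how the rest of the kernel would be adapted, not the kernel from the original post: the loop bounds are recomputed in float8 units, the tail is copied scalar by scalar, and it assumes srcBase and dstBase are multiples of 8 so the float8 accesses stay 32-byte aligned. The vectorized PTX the driver emits for this style of loop is shown below.

__kernel void brainCopyInput(__global const UniformBufferObject* parameters,
    __global float* inputOutputData, __global float* inputBuffer)
{
    uint itemId = get_local_id(0);
    uint groupId = get_group_id(0);
    uint workGroupSize = get_local_size(0);

    uint inputSize = parameters->m_inputSize;
    uint inputOutputSize = parameters->m_inputOutputSize;
    uint inputOutputStartOffset = parameters->m_inputOutputStartOffset;
    uint srcBase = groupId * inputSize;
    uint dstBase = groupId * inputOutputSize + inputOutputStartOffset;

    // work in float8 blocks: each work item moves 8 floats per iteration
    uint blockCount = inputSize / 8;
    uint blockRemainder = blockCount % workGroupSize;
    uint modBlockCount = blockCount - blockRemainder;

    __global float8* ptr0 = (__global float8*)&inputBuffer[srcBase];
    __global float8* ptr1 = (__global float8*)&inputOutputData[dstBase];
    for (uint i = 0; i < modBlockCount; i += workGroupSize)
    {
        ptr1[i + itemId] = ptr0[i + itemId];
    }
    if (itemId < blockRemainder)
    {
        ptr1[modBlockCount + itemId] = ptr0[modBlockCount + itemId];
    }

    // scalar tail for the last inputSize % 8 floats
    uint tailStart = blockCount * 8;
    if (itemId < (inputSize - tailStart))
    {
        inputOutputData[dstBase + tailStart + itemId] = inputBuffer[srcBase + tailStart + itemId];
    }
}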
$L__BB0_2:
add.s32 %r16, %r21, %r1;
mul.wide.u32 %rd8, %r16, 32;
add.s64 %rd9, %rd2, %rd8;
// NVIDIA unrolls the float8 into two 128 bit transactions,
// this code should be at least four times faster
ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd9];
ld.global.v4.f32 {%f9, %f10, %f11, %f12}, [%rd9+16];
add.s64 %rd10, %rd1, %rd8;
st.global.v4.f32 [%rd10+16], {%f9, %f10, %f11, %f12};
st.global.v4.f32 [%rd10], {%f1, %f2, %f3, %f4};
add.s32 %r21, %r21, %r4;
setp.lt.u32 %p2, %r21, %r6;
@%p2 bra $L__BB0_2;
$L__BB0_2:
add.s32 %r16, %r21, %r1;
mul.wide.u32 %rd8, %r16, 64;
add.s64 %rd9, %rd2, %rd8;
ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd9];
ld.global.v4.f32 {%f9, %f10, %f11, %f12}, [%rd9+16];
ld.global.v4.f32 {%f17, %f18, %f19, %f20}, [%rd9+32];
ld.global.v4.f32 {%f25, %f26, %f27, %f28}, [%rd9+48];
add.s64 %rd10, %rd1, %rd8;
st.global.v4.f32 [%rd10+48], {%f25, %f26, %f27, %f28};
st.global.v4.f32 [%rd10+32], {%f17, %f18, %f19, %f20};
st.global.v4.f32 [%rd10+16], {%f9, %f10, %f11, %f12};
st.global.v4.f32 [%rd10], {%f1, %f2, %f3, %f4};
add.s32 %r21, %r21, %r4;
setp.lt.u32 %p2, %r21, %r6;
@%p2 bra $L__BB0_2;
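The second listing strides by 64 bytes per index and issues four ld/st.global.v4.f32 pairs per iteration, which is what you would expect if the copy were widened further, for example to float16 (16 floats per work item). A minimal sketch of that loop, again an assumption rather than code from the post, with the loop bound assumed to be recomputed in float16 units inside the same kernel:

// Hypothetical float16 widening of the copy loop (matches the 64-byte stride above).
// Assumes modWorkGroupSize is now counted in float16 blocks and the bases are multiples of 16.
__global float16* src16 = (__global float16*)&inputBuffer[srcBase];
__global float16* dst16 = (__global float16*)&inputOutputData[dstBase];
for (uint i = 0; i < modWorkGroupSize; i += workGroupSize)
{
    // each work item moves 16 floats; the compiler emits four 128 bit load/store pairs
    dst16[i + itemId] = src16[i + itemId];
}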
Julio Jerez wrote: I find that the CPU runs faster in all cases.
The one thing I notice is that it seems OpenCL is better for GPU computing than Vulkan.
It's often less optimized than the PTX produced directly from CUDA code.