Unable to Find Mean and Var in Single Kernel Code
Dear Scholars,
I am unable to implement mean and var in single kernel call.Goal: I need to find mean and var of sub matrixs in a matrix. so I wrote following kernels
functions:
global void kernelMean(d_a, d_mean, ...);
global void kernelVar(d_a, d_var, ...);
global void kernelMeanVar(d_a, d_mean, d_var,...);
Issue: a) if I compute the kernelMean and kernelVar individually it works fine b) if I want to compute kernelMeanVar in single kernel is does not work.
Below is my code, kindly let me know on possible any errors.
Thanking you in advance.
Regards,
Nagaraju
global void kernelMean(float* device_array, float* device_mean, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
int block_id = blockIdx.x + gridDim.x * blockIdx.y;
int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
int my_rowIdx = thread_id/globalCols ;
int my_colIdx = thread_id%globalCols ;
int i,j;
float temp;
float sum = 0;
float sumsq = 0.0;
float mean;
float ltotal_elts = (float) (localRows*localCols);
device_mean[thread_id] = 0;
if(thread_id <total_num_threads)
{
for(i=0; i < localRows ; i++)
{
for(j=0 ; j < localCols; j++)
{
temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
sumsq = sumsq + (temp*temp);
sum = sum + temp;
}
}
mean = sum/ltotal_elts;
device_mean[thread_id] = mean;
}
}
global void kernelVar(float* device_array,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
int block_id = blockIdx.x + gridDim.x * blockIdx.y;
int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
int my_rowIdx = thread_id/globalCols ;
int my_colIdx = thread_id%globalCols ;
int i,j;
float temp;
float sum = 0;
float sumsq = 0;
float mean = 0;
float var = 0;
float ltotal_elts = (float) localRows*localCols;
device_var[thread_id] = 0;
if(thread_id < total_num_threads)
{
for(i=0; i < localRows ; i++)
{
for(j=0 ; j < localCols; j++)
{
temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
sum = sum + temp;
sumsq = sumsq + (temp*temp);
}
}
mean = sum/ltotal_elts;
device_var[thread_id] = (sumsq/ltotal_elts) - (mean*mean);
}
}
global void kernelMeanVar(float* device_array, float* device_mean,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
int block_id = blockIdx.x + gridDim.x * blockIdx.y;
int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
int my_rowIdx = thread_id/globalCols ;
int my_colIdx = thread_id%globalCols ;
int i,j;
float temp;
float sum = 0;
float sumsq = 0.0;
float mean;
float ltotal_elts = (float) (localRows*localCols);
device_mean[thread_id] = 0;
device_var[thread_id] = 0;
if(thread_id < total_num_threads)
{
for(i=0; i < localRows ; i++)
{
for(j=0 ; j < localCols; j++)
{
temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
sumsq = sumsq + (temp*temp);
sum = sum + temp;
}
}
mean = sum/ltotal_elts;
device_mean[thread_id] = mean;
device_var[thread_id] = (sumsq/ltotal_elts) - (mean*mean);
}
}
Kernel Call Functions
void convertToFloat(float** float_ary, double* double_ary, int num_elts)
{
for(int i = 0; i < num_elts ; i++)
{
(*float_ary)[i] = (float) double_ary[i];
//printf("float_ary[%d] : %f \n", i, (*float_ary)[i]);
}
return;
}
void convertToDouble(double** double_ary, float* float_ary, int num_elts)
{
for(int i = 0; i < num_elts ; i++)
{
(*double_ary)[i] = (double) float_ary[i];
}
return;
}
void computeMeanAndVarArray(double* host_array, int num_elts, int globalRows, int globalCols, int localRows, int localCols, int numRowElts, double** mean_ary, double** var_ary)
{
float* host_array_float;
float* device_array;
float* host_mean;
float* host_var;
float* device_mean;
float* device_var;
double total_bytes =0;
host_array_float = (float*) malloc (num_elts*sizeof(float));
convertToFloat(&host_array_float, host_array, num_elts);
//printf("num_elts %d \n", num_elts);
cudaMalloc((void**) &device_array, sizeof(float)* num_elts);
cudaMemset(device_array, 0, sizeof(float)* num_elts);
cudaMemcpy(device_array, host_array_float,sizeof(float)* num_elts, cudaMemcpyHostToDevice);
int numBlockThreads = MAX_THREADS_PER_BLOCK;
int num_blocks = 0;
int remain_elts = 0;
int total_num_threads = globalRows * globalCols;
cudaMalloc((void**) &device_mean, sizeof(float)* total_num_threads);
cudaMemset(device_mean, 0, sizeof(float)* total_num_threads);
cudaMalloc((void**) &device_var, sizeof(float)* total_num_threads);
cudaMemset(device_var, 0, sizeof(float)* total_num_t开发者_Python百科hreads);
num_blocks = total_num_threads/numBlockThreads;
remain_elts = total_num_threads%numBlockThreads;
if(remain_elts > 0)
{
num_blocks++;
}
dim3 gridDim(num_blocks,1);
dim3 blockDim(numBlockThreads,1);
//kernelMean<<< gridDim,blockDim >>>(device_array, device_mean,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
//kernelVar<<< gridDim,blockDim >>>(device_array, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
kernelMeanVar<<< gridDim,blockDim >>>(device_array, device_mean, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
host_mean = (float*) malloc( sizeof(float) * total_num_threads);
memset(host_mean, 0, sizeof(float) * total_num_threads);
host_var = (float*) malloc( sizeof(float) * total_num_threads);
memset(host_var, 0, sizeof(float) * total_num_threads);
cudaThreadSynchronize();
cudaError_t error = cudaGetLastError();
//if(error!=cudaSuccess) {
printf("ERROR: %s\n", cudaGetErrorString(error) );
//}
cudaMemcpy(host_mean, device_mean, sizeof(float)*total_num_threads, cudaMemcpyDeviceToHost);
convertToDouble(mean_ary, host_mean, total_num_threads);
cudaMemcpy(host_var, device_var, sizeof(float)*total_num_threads, cudaMemcpyDeviceToHost);
for(int i = 0 ; i < 300 ; i++)
printf("host_var[%d] %f \n",i, host_var[i]);
convertToDouble(var_ary, host_var, total_num_threads);
cudaFree(device_array);
cudaFree(device_mean);
cudaFree(device_var);
free(host_mean);
free(host_var);
free(host_array_float);
}
Results with enabling
global void kernelMean(d_a, d_mean, ...);
global void kernelVar(d_a, d_var, ...);
ERROR: no error
host_var[0] 4.497070
host_var[1] 5.061768
host_var[2] 5.687500
host_var[3] 6.347534
host_var[4] 6.829102
host_var[5] 12.940308
host_var[6] 14.309937
host_var[7] 15.141113
host_var[8] 18.741577
host_var[9] 21.323608
host_var[10] 21.727417
host_var[11] 192.348389
host_var[12] 579.911621
host_var[13] 800.821045
host_var[14] 1071.960938
host_var[15] 1077.261719
host_var[16] 993.262207
host_var[17] 924.379883
host_var[18] 839.437012
host_var[19] 810.847656
host_var[20] 835.007813
host_var[21] 1124.365723
host_var[22] 1241.685547
host_var[23] 1376.504150
host_var[24] 1196.745850
host_var[25] 1097.473877
host_var[26] 1008.840088
host_var[27] 867.585083
host_var[28] 794.241699
host_var[29] 1322.409790
host_var[30] 1556.029785
host_var[31] 1564.997803
host_var[32] 1870.985840
host_var[33] 1929.829590
host_var[34] 1822.189453
host_var[35] 1662.321777
host_var[36] 1372.886719
host_var[37] 1074.727539
host_var[38] 833.003906
host_var[39] 632.514648
host_var[40] 380.227539
host_var[41] 87.345703
host_var[42] 82.544922
host_var[43] 78.756836
host_var[44] 68.541016
host_var[45] 61.981445
host_var[46] 60.413086
host_var[47] 60.128906
host_var[48] 59.767578
host_var[49] 59.223633
host_var[50] 56.569336
host_var[51] 53.866211
host_var[52] 51.186523
host_var[53] 55.270508
host_var[54] 59.956055
host_var[55] 66.516602
host_var[56] 70.348633
host_var[57] 71.706055
host_var[58] 70.494141
host_var[59] 69.897461
host_var[60] 66.286133
host_var[61] 67.926758
host_var[62] 160.753906
host_var[63] 447.221191
host_var[64] 831.740723
host_var[65] 1076.513672
host_var[66] 1193.666992
host_var[67] 1208.239746
host_var[68] 1126.845947
host_var[69] 948.397461
host_var[70] 669.399414
host_var[71] 340.465576
host_var[72] 67.161865
host_var[73] 7.421082
host_var[74] 5.485626
host_var[75] 5.135620
host_var[76] 3.460419
host_var[77] 3.853577
host_var[78] 5.221100
host_var[79] 5.890381
host_var[80] 7.139618
host_var[81] 7.517609
host_var[82] 6.865875
host_var[83] 5.053909
host_var[84] 2.781616
host_var[85] 2.021912
host_var[86] 2.130417
host_var[87] 3.113586
host_var[88] 4.024399
host_var[89] 4.582413
host_var[90] 4.077118
host_var[91] 3.024384
host_var[92] 2.287506
host_var[93] 1.793579
host_var[94] 1.567474
host_var[95] 1.829895
host_var[96] 2.325928
host_var[97] 3.429993
host_var[98] 3.885559
host_var[99] 3.835602
host_var[100] 5.566406
host_var[101] 8.065582
host_var[102] 18.767456
host_var[103] 35.395599
host_var[104] 64.148407
host_var[105] 125.937866
host_var[106] 176.445618
host_var[107] 216.073059
host_var[108] 272.109985
host_var[109] 307.972412
host_var[110] 289.652344
host_var[111] 238.253662
host_var[112] 178.304932
host_var[113] 116.925049
host_var[114] 74.773926
host_var[115] 61.227295
host_var[116] 55.238525
host_var[117] 55.387451
host_var[118] 49.241699
host_var[119] 38.396240
host_var[120] 28.304932
host_var[121] 20.225342
host_var[122] 18.043457
host_var[123] 21.418457
host_var[124] 26.120117
host_var[125] 25.899414
host_var[126] 26.641602
host_var[127] 23.747437
host_var[128] 18.927368
host_var[129] 21.664307
host_var[130] 142.432373
host_var[131] 1575.141602
host_var[132] 2901.855957
host_var[133] 4195.149902
host_var[134] 5047.758789
host_var[135] 5450.164063
host_var[136] 5249.767578
host_var[137] 4577.365234
host_var[138] 3352.496094
host_var[139] 1641.593750
host_var[140] 352.242188
host_var[141] 224.824219
host_var[142] 194.578125
host_var[143] 178.875000
host_var[144] 175.148438
host_var[145] 174.117188
host_var[146] 172.707031
host_var[147] 169.578125
host_var[148] 176.308594
host_var[149] 181.968750
host_var[150] 191.507813
host_var[151] 198.500000
host_var[152] 206.824219
host_var[153] 213.273438
host_var[154] 220.312500
host_var[155] 218.859375
host_var[156] 213.941406
host_var[157] 205.474609
host_var[158] 190.722656
host_var[159] 178.414063
host_var[160] 169.302734
host_var[161] 3.750366
host_var[162] 4.333252
host_var[163] 4.901855
host_var[164] 5.527466
host_var[165] 6.201782
host_var[166] 11.921631
host_var[167] 14.135376
host_var[168] 14.885864
host_var[169] 19.083618
host_var[170] 21.290283
host_var[171] 21.415649
host_var[172] 209.747559
host_var[173] 580.304932
host_var[174] 800.949951
host_var[175] 1119.857422
host_var[176] 1129.382324
host_var[177] 1032.616211
host_var[178] 972.797363
host_var[179] 915.440918
host_var[180] 905.890137
host_var[181] 943.649902
host_var[182] 1207.445801
host_var[183] 1345.912109
host_var[184] 1478.704590
host_var[185] 1224.895508
host_var[186] 1105.403564
host_var[187] 1031.981201
host_var[188] 914.456421
host_var[189] 835.127441
host_var[190] 1320.454102
host_var[191] 1561.439941
host_var[192] 1599.149902
host_var[193] 1912.232910
host_var[194] 1993.473145
host_var[195] 1913.377441
host_var[196] 1784.035645
host_var[197] 1554.712891
host_var[198] 1244.698242
host_var[199] 926.668945
Reusults with enabling
global void kernelMeanVar(d_a, d_mean, d_var,...);
ERROR: no error
host_var[0] 0.000000
host_var[1] 0.000000
host_var[2] 0.000000
host_var[3] 0.000000
host_var[4] 0.000000
host_var[5] 0.000000
host_var[6] 0.000000
host_var[7] 0.000000
host_var[8] 0.000000
host_var[9] 0.000000
host_var[10] 0.000000
host_var[11] 0.000000
host_var[12] 0.000000
host_var[13] 0.000000
host_var[14] 0.000000
host_var[15] 0.000000
host_var[16] 0.000000
host_var[17] 0.000000
host_var[18] 0.000000
host_var[19] 0.000000
host_var[20] 0.000000
host_var[21] 0.000000
host_var[22] 0.000000
host_var[23] 0.000000
host_var[24] 0.000000
host_var[25] 0.000000
host_var[26] 0.000000
host_var[27] 0.000000
host_var[28] 0.000000
host_var[29] 0.000000
host_var[30] 0.000000
host_var[31] 0.000000
host_var[32] 0.000000
host_var[33] 0.000000
host_var[34] 0.000000
host_var[35] 0.000000
host_var[36] 0.000000
host_var[37] 0.000000
host_var[38] 0.000000
host_var[39] 0.000000
host_var[40] 0.000000
host_var[41] 0.000000
host_var[42] 0.000000
host_var[43] 0.000000
host_var[44] 0.000000
host_var[45] 0.000000
host_var[46] 0.000000
host_var[47] 0.000000
host_var[48] 0.000000
host_var[49] 0.000000
host_var[50] 0.000000
host_var[51] 0.000000
host_var[52] 0.000000
host_var[53] 0.000000
host_var[54] 0.000000
host_var[55] 0.000000
host_var[56] 0.000000
host_var[57] 0.000000
host_var[58] 0.000000
host_var[59] 0.000000
host_var[60] 0.000000
host_var[61] 0.000000
host_var[62] 0.000000
host_var[63] 0.000000
host_var[64] 0.000000
host_var[65] 0.000000
host_var[66] 0.000000
host_var[67] 0.000000
host_var[68] 0.000000
host_var[69] 0.000000
host_var[70] 0.000000
host_var[71] 0.000000
host_var[72] 0.000000
host_var[73] 0.000000
host_var[74] 0.000000
host_var[75] 0.000000
host_var[76] 0.000000
host_var[77] 0.000000
host_var[78] 0.000000
host_var[79] 0.000000
host_var[80] 0.000000
host_var[81] 0.000000
host_var[82] 0.000000
host_var[83] 0.000000
host_var[84] 0.000000
host_var[85] 0.000000
host_var[86] 0.000000
host_var[87] 0.000000
host_var[88] 0.000000
host_var[89] 0.000000
host_var[90] 0.000000
host_var[91] 0.000000
host_var[92] 0.000000
host_var[93] 0.000000
host_var[94] 0.000000
host_var[95] 0.000000
host_var[96] 0.000000
host_var[97] 0.000000
host_var[98] 0.000000
host_var[99] 0.000000
host_var[100] 0.000000
host_var[101] 0.000000
host_var[102] 0.000000
host_var[103] 0.000000
host_var[104] 0.000000
host_var[105] 0.000000
host_var[106] 0.000000
host_var[107] 0.000000
host_var[108] 0.000000
host_var[109] 0.000000
host_var[110] 0.000000
host_var[111] 0.000000
host_var[112] 0.000000
host_var[113] 0.000000
host_var[114] 0.000000
host_var[115] 0.000000
host_var[116] 0.000000
host_var[117] 0.000000
host_var[118] 0.000000
host_var[119] 0.000000
host_var[120] 0.000000
host_var[121] 0.000000
host_var[122] 0.000000
host_var[123] 0.000000
host_var[124] 0.000000
host_var[125] 0.000000
host_var[126] 0.000000
host_var[127] 0.000000
host_var[128] 18.927368
host_var[129] 21.664307
host_var[130] 142.432373
host_var[131] 1575.141602
host_var[132] 2901.855957
host_var[133] 4195.149902
host_var[134] 5047.758789
host_var[135] 5450.164063
host_var[136] 5249.767578
host_var[137] 4577.365234
host_var[138] 3352.496094
host_var[139] 1641.593750
host_var[140] 352.242188
host_var[141] 224.824219
host_var[142] 194.578125
host_var[143] 178.875000
host_var[144] 175.148438
host_var[145] 174.117188
host_var[146] 172.707031
host_var[147] 169.578125
host_var[148] 176.308594
host_var[149] 181.968750
host_var[150] 191.507813
host_var[151] 198.500000
host_var[152] 206.824219
host_var[153] 213.273438
host_var[154] 220.312500
host_var[155] 218.859375
host_var[156] 213.941406
host_var[157] 205.474609
host_var[158] 190.722656
host_var[159] 178.414063
host_var[160] 169.302734
host_var[161] 3.750366
host_var[162] 4.333252
host_var[163] 4.901855
host_var[164] 5.527466
host_var[165] 6.201782
host_var[166] 11.921631
host_var[167] 14.135376
host_var[168] 14.885864
host_var[169] 19.083618
host_var[170] 21.290283
host_var[171] 21.415649
host_var[172] 209.747559
host_var[173] 580.304932
host_var[174] 800.949951
host_var[175] 1119.857422
host_var[176] 1129.382324
host_var[177] 1032.616211
host_var[178] 972.797363
host_var[179] 915.440918
host_var[180] 905.890137
host_var[181] 943.649902
host_var[182] 1207.445801
host_var[183] 1345.912109
host_var[184] 1478.704590
host_var[185] 1224.895508
host_var[186] 1105.403564
host_var[187] 1031.981201
host_var[188] 914.456421
host_var[189] 835.127441
host_var[190] 1320.454102
host_var[191] 1561.439941
host_var[192] 1599.149902
host_var[193] 1912.232910
host_var[194] 1993.473145
host_var[195] 1913.377441
host_var[196] 1784.035645
host_var[197] 1554.712891
host_var[198] 1244.698242
host_var[199] 926.668945
End of Results
Suggestions:
- [First priority] Check return values from CUDA functions to see whether any errors are reported.
- Run this through cuda-memcheck. I'm not sure what the relationship is between globalRows, globalCols, localRows, localCols, num_elts etc. is but reading out-of-bounds seems like a candadite for problems.
- Remember that summing the squares can lead to rounding errors fairly quickly if you don't take care. Consider using a running mean/variance or doing a tree-based reduction.
精彩评论