I have a series of arrays that I need to convolve. They are not all the same size, but their sizes have an upper bound of about 50. The offload looks something like this (simplified):
#pragma offload target(mic)
{
    #pragma omp parallel for
    for (long j=0; j<(long)1e9; ++j) {
        VSLConvTaskPtr task;
        float x[30]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30};
        float y[30]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30};
        float z[59]={0};
        MKL_INT xshape=30, yshape=30, zshape=59;
        int status;
        int mode = VSL_CONV_MODE_AUTO;
        status = vslsConvNewTask1D(&task,mode,xshape,yshape,zshape);
        CheckVslError(status);
        status = vslsConvExec1D(task,x,1,y,1,z,1);
        CheckVslError(status);
        status = vslConvDeleteTask(&task);
        CheckVslError(status);
    }
}

When I profile this, over half the time is spent in mkl_serv_malloc, mkl_serv_free, and mkl_conv_newtask. Note that in the actual code, xshape, yshape, and zshape are not constant (but are bounded). Is there any way to drive this so that the allocation overhead is eliminated?