Hi, and I want to apologize first if I posted this in the wrong forum category. I just started coding with C++ AMP and am currently trying to implement the staging data-transfer concept in my code, but I have no clue how to verify whether my staging is coded and working correctly.
My current understanding:
#include <amp.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#define DATA_TYPE float
#define TILE_SIZE 32
using namespace std;
using namespace concurrency;
/// Checks that the first `len` elements of two float vectors agree within an
/// absolute tolerance of 0.01.
///
/// @param v_res  Values produced by the code under test.
/// @param v_ref  Reference values to compare against.
/// @param len    Number of leading elements to compare.
/// @return true when every compared pair matches; false on the first mismatch
///         (which is printed), or when `len` is larger than either vector.
///         A non-positive `len` compares nothing and returns true.
bool verify(std::vector<float>& v_res, std::vector<float>& v_ref, int len)
{
    const double tolerance = 0.01;

    if (len <= 0)
    {
        return true;
    }

    // Refuse to index past the end of either vector instead of reading out of bounds.
    const std::vector<float>::size_type count = static_cast<std::vector<float>::size_type>(len);
    if (count > v_res.size() || count > v_ref.size())
    {
        std::printf("verify: len = %d exceeds vector size (v_res: %lu, v_ref: %lu)\n",
                    len,
                    static_cast<unsigned long>(v_res.size()),
                    static_cast<unsigned long>(v_ref.size()));
        return false;
    }

    for (int i = 0; i < len; ++i)
    {
        const float res = v_res[i];
        const float ref = v_ref[i];

        // Exactly equal values (including equal infinities) always match.
        if (res == ref)
        {
            continue;
        }

        // Written as !(diff <= tolerance) so that a NaN difference counts as a
        // mismatch; `diff > tolerance` would be false for NaN and let it pass.
        const double diff = static_cast<double>(std::fabs(res - ref));
        if (!(diff <= tolerance))
        {
            std::printf("v_res[%d] = %f, v_ref[%d] = %f\n", i, res, i, ref);
            return false;
        }
    }
    return true;
}
double ElapsedTime(const LARGE_INTEGER& start, const LARGE_INTEGER& end) {
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
return (double(end.QuadPart) - double(start.QuadPart))*1000.0 / double(freq.QuadPart);
}
int main()
{
LARGE_INTEGER start, end;
const int M = 32768;
const int K = 1024;
const int N = 1024;
vector<DATA_TYPE> inData(M * K);
vector<DATA_TYPE> outData(M * K);
vector<DATA_TYPE> outBuffer(M* K);
generate(inData.begin(), inData.end(), [] {return static_cast<DATA_TYPE>(rand() % RAND_MAX) / RAND_MAX; });
accelerator cpu(accelerator::cpu_accelerator);
accelerator gpu(accelerator::default_accelerator);
wcout << "CPU " << cpu.get_description() << endl;
wcout << "GPU " << gpu.get_description() << endl<<endl;
accelerator_view cpu_view(cpu.default_view);
accelerator_view gpu_view(gpu.default_view);
QueryPerformanceCounter(&start);
concurrency::array<DATA_TYPE, 2> inStag(M, K, inData.begin(), inData.end(), cpu_view, gpu_view);
QueryPerformanceCounter(&end);
printf("Copy into vector->inStag: %3.5f ms\n", ElapsedTime(start, end));
concurrency::array<DATA_TYPE, 2> outStag(M, K, gpu_view, cpu_view);
concurrency::array<DATA_TYPE, 2> deviceInArray(M, K, gpu_view, cpu_view);
concurrency::array<DATA_TYPE, 2> deviceOutArray(M, K, gpu_view, cpu_view);
array_view<DATA_TYPE, 2> deviceInArrayView(deviceInArray);
array_view<DATA_TYPE, 2> deviceOutArrayView(deviceOutArray);
QueryPerformanceCounter(&start);
copy(inStag, deviceInArrayView);
QueryPerformanceCounter(&end);
printf("Copy into inputStag->deviceInArray: %3.5f ms\n", ElapsedTime(start, end));
array_view<DATA_TYPE, 2> outStagView(outStag); outStagView.discard_data();
QueryPerformanceCounter(&start);
parallel_for_each(deviceInArrayView.extent, [=](index<2> idx) restrict(amp) { deviceOutArrayView[idx] = deviceInArrayView[idx]; });
QueryPerformanceCounter(&end);
printf("GPU Processing [deviceInArray->deviceOutArray] time: %3.5f ms\n", ElapsedTime(start, end));
QueryPerformanceCounter(&start);
copy(deviceOutArrayView, outStagView);
QueryPerformanceCounter(&end);
printf("deviceOutArrayView->outStagView time: %3.5f ms\n", ElapsedTime(start, end));
QueryPerformanceCounter(&start);
copy(outStagView, outData.begin());
QueryPerformanceCounter(&end);
printf("copy to outStagView->vector outData: %3.5f ms\n", ElapsedTime(start, end));
printf("%s\n", verify(inData, outData, inData.size()) ? "matched" : "not matching");
printf("\n\nnon-stagging\n\n");
QueryPerformanceCounter(&start);
concurrency::array<DATA_TYPE, 2> inNoStag(M, K, inData.begin(), inData.end());
QueryPerformanceCounter(&end);
printf("Copy into vector->inputView: %3.5f ms\n", ElapsedTime(start, end));
concurrency::array<DATA_TYPE, 2> outNoStag(M, K);
array_view<DATA_TYPE, 2> inNoStagView(inNoStag);
array_view<DATA_TYPE, 2> outNoStagView(outNoStag); outNoStagView.discard_data();
QueryPerformanceCounter(&start);
parallel_for_each(inNoStagView.extent, [=](index<2> idx) restrict(amp) { outNoStagView[idx] = inNoStagView[idx]; });
QueryPerformanceCounter(&end);
printf("GPU Processing [inputView->outView] time: %3.5f ms\n", ElapsedTime(start, end));
QueryPerformanceCounter(&start);
outNoStagView.synchronize();
QueryPerformanceCounter(&end);
printf("outStagView Synchro time: %3.5f ms\n", ElapsedTime(start, end));
QueryPerformanceCounter(&start);
copy(outNoStagView, outData.begin());
QueryPerformanceCounter(&end);
printf("copy to outView->outData: %3.5f ms\n", ElapsedTime(start, end));
verify(inData, outData, inData.size());
printf("%s\n", verify(inData, outData, inData.size()) ? "matched" : "not matching");
return 0;
}
Is there anything wrong with my implementation? The non-staging version is significantly faster than my staging version. I think the timing output below shows that something is wrong with my staging:
CPU CPU accelerator
GPU Intel(R) HD Graphics 4000

Copy into std::vector->inStag: 71.40711 ms
Copy into inputStag->deviceInArray: 0.09693 ms
GPU Processing [deviceInArray->deviceOutArray] time: 2.38620 ms
deviceOutArrayView->outStagView time: 0.03991 ms
copy to outStagView->std::vector outData: 263.39368 ms
matched

non-stagging

Copy into std::vector->inputView: 71.59185 ms
GPU Processing [inputView->outView] time: 0.22180 ms
outStagView Synchro time: 0.00513 ms
copy to outView->outData: 134.93717 ms
matched