
Need help with C++ AMP Staging Implementation


Hi, and I want to apologize first if I have posted this in the wrong forum category. I just started coding with C++ AMP and am currently trying to implement the staging data transfer concept in my code, but I have no clue how to verify whether my staging code is correct and actually working.
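
From the blog posts and samples I have read, I think the basic staging pattern is supposed to look roughly like this (just my understanding so far; the names below are placeholders for illustration, not my real code):

#include <amp.h>
#include <vector>
using namespace concurrency;

void staging_sketch()
{
    const int N = 1024;
    accelerator_view cpu_view = accelerator(accelerator::cpu_accelerator).default_view;
    accelerator_view gpu_view = accelerator(accelerator::default_accelerator).default_view;

    std::vector<float> host(N, 1.0f);

    // Staging array: allocated so the CPU can access it directly, but associated
    // with the GPU view so copies to/from the device can be faster.
    array<float, 1> staging(N, host.begin(), host.end(), cpu_view, gpu_view);

    // Plain device arrays.
    array<float, 1> device_in(N, gpu_view);
    array<float, 1> device_out(N, gpu_view);

    copy(staging, device_in);                         // staging -> device

    array_view<float, 1> in_view(device_in);
    array_view<float, 1> out_view(device_out);
    out_view.discard_data();
    parallel_for_each(in_view.extent, [=](index<1> idx) restrict(amp) {
        out_view[idx] = in_view[idx] * 2.0f;
    });

    copy(device_out, staging);                        // device -> staging
    copy(staging, host.begin());                      // staging -> std::vector
}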

Here is my full implementation:

#include <amp.h>
#include <iostream>
#include <vector>
#include <algorithm>    // std::generate
#include <cstdio>
#include <cstdlib>      // rand
#include <cmath>        // fabs
#include <windows.h>    // QueryPerformanceCounter / LARGE_INTEGER

#define DATA_TYPE float
#define TILE_SIZE 32

using namespace std;
using namespace concurrency;

bool verify(const vector<float>& v_res, const vector<float>& v_ref, int len)
{
    bool passed = true;

    for (int i = 0; i < len; ++i)
    {
        if (fabs(v_res[i] - v_ref[i]) > 0.01)
        {
            printf("v_res[%d] = %f, v_ref[%d] = %f\n", i, v_res[i], i, v_ref[i]);
            passed = false;
            break;
        }
    }

    return passed;
}

double ElapsedTime(const LARGE_INTEGER& start, const LARGE_INTEGER& end) {
    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq);
    return (double(end.QuadPart) - double(start.QuadPart))*1000.0 / double(freq.QuadPart);
}


int main()
{
    LARGE_INTEGER start, end;

    const int M = 32768;
    const int K = 1024;
    const int N = 1024;

    vector<DATA_TYPE> inData(M * K);
    vector<DATA_TYPE> outData(M * K);

    generate(inData.begin(), inData.end(), [] {return static_cast<DATA_TYPE>(rand() % RAND_MAX) / RAND_MAX; });

    accelerator cpu(accelerator::cpu_accelerator);
    accelerator gpu(accelerator::default_accelerator);
    wcout << "CPU " << cpu.get_description() << endl;
    wcout << "GPU " << gpu.get_description() << endl<<endl;
    accelerator_view cpu_view(cpu.default_view);
    accelerator_view gpu_view(gpu.default_view);

    // inStag is a staging array: constructed on cpu_view with gpu_view as its associated view.
    QueryPerformanceCounter(&start);
    concurrency::array<DATA_TYPE, 2> inStag(M, K, inData.begin(), inData.end(), cpu_view, gpu_view);
    QueryPerformanceCounter(&end);
    printf("Copy into vector->inStag: %3.5f ms\n", ElapsedTime(start, end));
    concurrency::array<DATA_TYPE, 2> outStag(M, K, gpu_view, cpu_view);

    concurrency::array<DATA_TYPE, 2> deviceInArray(M, K, gpu_view, cpu_view);
    concurrency::array<DATA_TYPE, 2> deviceOutArray(M, K, gpu_view, cpu_view);
    array_view<DATA_TYPE, 2> deviceInArrayView(deviceInArray);
    array_view<DATA_TYPE, 2> deviceOutArrayView(deviceOutArray);
    QueryPerformanceCounter(&start);
    copy(inStag, deviceInArrayView);
    QueryPerformanceCounter(&end);
    printf("Copy into inputStag->deviceInArray: %3.5f ms\n", ElapsedTime(start, end));

    array_view<DATA_TYPE, 2> outStagView(outStag);
    outStagView.discard_data();   // existing contents do not need to be copied to the accelerator

    QueryPerformanceCounter(&start);
    parallel_for_each(deviceInArrayView.extent, [=](index<2> idx) restrict(amp) {
        deviceOutArrayView[idx] = deviceInArrayView[idx];
    });
    QueryPerformanceCounter(&end);
    printf("GPU Processing [deviceInArray->deviceOutArray] time: %3.5f ms\n", ElapsedTime(start, end));

    QueryPerformanceCounter(&start);
    copy(deviceOutArrayView, outStagView);
    QueryPerformanceCounter(&end);
    printf("deviceOutArrayView->outStagView time: %3.5f ms\n", ElapsedTime(start, end));

    QueryPerformanceCounter(&start);
    copy(outStagView, outData.begin());
    QueryPerformanceCounter(&end);
    printf("copy to outStagView->vector outData: %3.5f ms\n", ElapsedTime(start, end));
    printf("%s\n", verify(inData, outData, inData.size()) ? "matched" : "not matching");

    printf("\n\nnon-stagging\n\n");

    QueryPerformanceCounter(&start);
    concurrency::array<DATA_TYPE, 2> inNoStag(M, K, inData.begin(), inData.end());
    QueryPerformanceCounter(&end);
    printf("Copy into vector->inputView: %3.5f ms\n", ElapsedTime(start, end));
    concurrency::array<DATA_TYPE, 2> outNoStag(M, K);

    array_view<DATA_TYPE, 2> inNoStagView(inNoStag);
    array_view<DATA_TYPE, 2> outNoStagView(outNoStag);
    outNoStagView.discard_data();

    QueryPerformanceCounter(&start);
    parallel_for_each(inNoStagView.extent, [=](index<2> idx) restrict(amp) {
        outNoStagView[idx] = inNoStagView[idx];
    });
    QueryPerformanceCounter(&end);
    printf("GPU Processing [inputView->outView] time: %3.5f ms\n", ElapsedTime(start, end));

    QueryPerformanceCounter(&start);
    outNoStagView.synchronize();
    QueryPerformanceCounter(&end);
    printf("outStagView Synchro time: %3.5f ms\n", ElapsedTime(start, end));

    QueryPerformanceCounter(&start);
    copy(outNoStagView, outData.begin());
    QueryPerformanceCounter(&end);
    printf("copy to outView->outData: %3.5f ms\n", ElapsedTime(start, end));
    printf("%s\n", verify(inData, outData, static_cast<int>(inData.size())) ? "matched" : "not matching");

    return 0;
}


Is there anything wrong with my implementation? The non-staging version is significantly faster than my staging version. I think the timing output below shows that something is wrong with my staging.

CPU CPU accelerator
GPU Intel(R) HD Graphics 4000

Copy into std::vector->inStag: 71.40711 ms
Copy into inputStag->deviceInArray: 0.09693 ms
GPU Processing [deviceInArray->deviceOutArray] time: 2.38620 ms
deviceOutArrayView->outStagView time: 0.03991 ms
copy to outStagView->std::vector outData: 263.39368 ms
matched


non-staging

Copy into std::vector->inputView: 71.59185 ms
GPU Processing [inputView->outView] time: 0.22180 ms
outNoStagView synchronize time: 0.00513 ms
copy to outView->outData: 134.93717 ms
matched
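
One thing I am still not sure about is the timing itself: since parallel_for_each runs asynchronously, maybe I should be waiting on the accelerator_view before stopping the timer, something like this (just a guess on my part, not sure whether this is the right way to measure):

    QueryPerformanceCounter(&start);
    parallel_for_each(deviceInArrayView.extent, [=](index<2> idx) restrict(amp) {
        deviceOutArrayView[idx] = deviceInArrayView[idx];
    });
    gpu_view.wait();   // block until all commands queued on this view have finished
    QueryPerformanceCounter(&end);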




