
Need help with C++ AMP Staging Implementation


Hi, and I want to apologize first if I have posted this in the wrong forum category. I just started coding with C++ AMP and am currently trying to implement the staging data transfer concept in my code, but I have no clue how to verify whether my staging code is correct and actually working.
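
From the blog posts and samples I have read, I think the basic staging pattern is supposed to look roughly like this (just my understanding so far; the names below are placeholders for illustration, not my real code):

#include <amp.h>
#include <vector>
using namespace concurrency;

void staging_sketch()
{
    const int N = 1024;
    accelerator_view cpu_view = accelerator(accelerator::cpu_accelerator).default_view;
    accelerator_view gpu_view = accelerator(accelerator::default_accelerator).default_view;

    std::vector<float> host(N, 1.0f);

    // Staging array: allocated so the CPU can access it directly, but associated
    // with the GPU view so copies to/from the device can be faster.
    array<float, 1> staging(N, host.begin(), host.end(), cpu_view, gpu_view);

    // Plain device arrays.
    array<float, 1> device_in(N, gpu_view);
    array<float, 1> device_out(N, gpu_view);

    copy(staging, device_in);                         // staging -> device

    array_view<float, 1> in_view(device_in);
    array_view<float, 1> out_view(device_out);
    out_view.discard_data();
    parallel_for_each(in_view.extent, [=](index<1> idx) restrict(amp) {
        out_view[idx] = in_view[idx] * 2.0f;
    });

    copy(device_out, staging);                        // device -> staging
    copy(staging, host.begin());                      // staging -> std::vector
}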

Here is my full implementation:

#include <amp.h>
#include <iostream>
#include <vector>
#include <algorithm>    // std::generate
#include <cstdio>
#include <cstdlib>      // rand
#include <cmath>        // fabs
#include <windows.h>    // QueryPerformanceCounter / LARGE_INTEGER

#define DATA_TYPE float
#define TILE_SIZE 32

using namespace std;
using namespace concurrency;

bool verify(const vector<float>& v_res, const vector<float>& v_ref, int len)
{
    bool passed = true;

    for (int i = 0; i < len; ++i)
    {
        if (fabs(v_res[i] - v_ref[i]) > 0.01)
        {
            printf("v_res[%d] = %f, v_ref[%d] = %f\n", i, v_res[i], i, v_ref[i]);
            passed = false;
            break;
        }
    }

    return passed;
}

double ElapsedTime(const LARGE_INTEGER& start, const LARGE_INTEGER& end) {
    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq);
    return (double(end.QuadPart) - double(start.QuadPart))*1000.0 / double(freq.QuadPart);
}


int main()
{
    LARGE_INTEGER start, end;

    const int M = 32768;
    const int K = 1024;
    const int N = 1024;

    vector<DATA_TYPE> inData(M * K);
    vector<DATA_TYPE> outData(M * K);

    generate(inData.begin(), inData.end(), [] {return static_cast<DATA_TYPE>(rand() % RAND_MAX) / RAND_MAX; });

    accelerator cpu(accelerator::cpu_accelerator);
    accelerator gpu(accelerator::default_accelerator);
    wcout << "CPU " << cpu.get_description() << endl;
    wcout << "GPU " << gpu.get_description() << endl<<endl;
    accelerator_view cpu_view(cpu.default_view);
    accelerator_view gpu_view(gpu.default_view);

    // inStag is a staging array: constructed on cpu_view with gpu_view as its associated view.
    QueryPerformanceCounter(&start);
    concurrency::array<DATA_TYPE, 2> inStag(M, K, inData.begin(), inData.end(), cpu_view, gpu_view);
    QueryPerformanceCounter(&end);
    printf("Copy into vector->inStag: %3.5f ms\n", ElapsedTime(start, end));
    concurrency::array<DATA_TYPE, 2> outStag(M, K, gpu_view, cpu_view);

    concurrency::array<DATA_TYPE, 2> deviceInArray(M, K, gpu_view, cpu_view);
    concurrency::array<DATA_TYPE, 2> deviceOutArray(M, K, gpu_view, cpu_view);
    array_view<DATA_TYPE, 2> deviceInArrayView(deviceInArray);
    array_view<DATA_TYPE, 2> deviceOutArrayView(deviceOutArray);
    QueryPerformanceCounter(&start);
    copy(inStag, deviceInArrayView);
    QueryPerformanceCounter(&end);
    printf("Copy into inputStag->deviceInArray: %3.5f ms\n", ElapsedTime(start, end));

    array_view<DATA_TYPE, 2> outStagView(outStag);
    outStagView.discard_data();   // existing contents do not need to be copied to the accelerator

    QueryPerformanceCounter(&start);
    parallel_for_each(deviceInArrayView.extent, [=](index<2> idx) restrict(amp) {
        deviceOutArrayView[idx] = deviceInArrayView[idx];
    });
    QueryPerformanceCounter(&end);
    printf("GPU Processing [deviceInArray->deviceOutArray] time: %3.5f ms\n", ElapsedTime(start, end));

    QueryPerformanceCounter(&start);
    copy(deviceOutArrayView, outStagView);
    QueryPerformanceCounter(&end);
    printf("deviceOutArrayView->outStagView time: %3.5f ms\n", ElapsedTime(start, end));

    QueryPerformanceCounter(&start);
    copy(outStagView, outData.begin());
    QueryPerformanceCounter(&end);
    printf("copy to outStagView->vector outData: %3.5f ms\n", ElapsedTime(start, end));
    printf("%s\n", verify(inData, outData, inData.size()) ? "matched" : "not matching");

    printf("\n\nnon-stagging\n\n");

    QueryPerformanceCounter(&start);
    concurrency::array<DATA_TYPE, 2> inNoStag(M, K, inData.begin(), inData.end());
    QueryPerformanceCounter(&end);
    printf("Copy into vector->inputView: %3.5f ms\n", ElapsedTime(start, end));
    concurrency::array<DATA_TYPE, 2> outNoStag(M, K);

    array_view<DATA_TYPE, 2> inNoStagView(inNoStag);
    array_view<DATA_TYPE, 2> outNoStagView(outNoStag);
    outNoStagView.discard_data();

    QueryPerformanceCounter(&start);
    parallel_for_each(inNoStagView.extent, [=](index<2> idx) restrict(amp) {
        outNoStagView[idx] = inNoStagView[idx];
    });
    QueryPerformanceCounter(&end);
    printf("GPU Processing [inputView->outView] time: %3.5f ms\n", ElapsedTime(start, end));

    QueryPerformanceCounter(&start);
    outNoStagView.synchronize();
    QueryPerformanceCounter(&end);
    printf("outStagView Synchro time: %3.5f ms\n", ElapsedTime(start, end));

    QueryPerformanceCounter(&start);
    copy(outNoStagView, outData.begin());
    QueryPerformanceCounter(&end);
    printf("copy to outView->outData: %3.5f ms\n", ElapsedTime(start, end));
    printf("%s\n", verify(inData, outData, static_cast<int>(inData.size())) ? "matched" : "not matching");

    return 0;
}


Is there anything wrong with my implementation? The non-staging version is significantly faster than my staging version. I think the timing output below shows that something is wrong with my staging.

CPU CPU accelerator
GPU Intel(R) HD Graphics 4000

Copy into std::vector->inStag: 71.40711 ms
Copy into inputStag->deviceInArray: 0.09693 ms
GPU Processing [deviceInArray->deviceOutArray] time: 2.38620 ms
deviceOutArrayView->outStagView time: 0.03991 ms
copy to outStagView->std::vector outData: 263.39368 ms
matched


non-staging

Copy into std::vector->inputView: 71.59185 ms
GPU Processing [inputView->outView] time: 0.22180 ms
outNoStagView synchronize time: 0.00513 ms
copy to outView->outData: 134.93717 ms
matched
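
One thing I am still not sure about is the timing itself: since parallel_for_each runs asynchronously, maybe I should be waiting on the accelerator_view before stopping the timer, something like this (just a guess on my part, not sure whether this is the right way to measure):

    QueryPerformanceCounter(&start);
    parallel_for_each(deviceInArrayView.extent, [=](index<2> idx) restrict(amp) {
        deviceOutArrayView[idx] = deviceInArrayView[idx];
    });
    gpu_view.wait();   // block until all commands queued on this view have finished
    QueryPerformanceCounter(&end);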




