performance - c++ - Np matmul vs - 요소 별 추가가 결합 된 루프보다 개별 루프에서 훨씬 빠른 이유는 무엇입니까?

ICC 설치 / c++ / x86 / vectorization / compiler-optimization

a1 , b1 , c1d1 이 힙 메모리를 가리키고 내 숫자 코드에 다음 코어 루프가 있다고 가정 합니다.

const int n = 100000;

for (int j = 0; j < n; j++) {
    a1[j] += b1[j];
    c1[j] += d1[j];
}
for (int j = 0; j < n; j++) {
    a1[j] += b1[j];
}

for (int j = 0; j < n; j++) {
    c1[j] += d1[j];
}

첫 번째 루프의 디스 어셈블리는 기본적으로 다음과 같습니다 (이 블록은 전체 프로그램에서 약 5 회 반복됩니다).

movsd       xmm0,mmword ptr [edx+18h]
addsd       xmm0,mmword ptr [ecx+20h]
movsd       mmword ptr [ecx+20h],xmm0
movsd       xmm0,mmword ptr [esi+10h]
addsd       xmm0,mmword ptr [eax+30h]
movsd       mmword ptr [eax+30h],xmm0
movsd       xmm0,mmword ptr [edx+20h]
addsd       xmm0,mmword ptr [ecx+28h]
movsd       mmword ptr [ecx+28h],xmm0
movsd       xmm0,mmword ptr [esi+18h]
addsd       xmm0,mmword ptr [eax+38h]

이중 루프 예제의 각 루프는이 코드를 생성합니다 (다음 블록은 약 세 번 반복됨).

addsd       xmm0,mmword ptr [eax+28h]
movsd       mmword ptr [eax+28h],xmm0
movsd       xmm0,mmword ptr [ecx+20h]
addsd       xmm0,mmword ptr [eax+30h]
movsd       mmword ptr [eax+30h],xmm0
movsd       xmm0,mmword ptr [ecx+28h]
addsd       xmm0,mmword ptr [eax+38h]
movsd       mmword ptr [eax+38h],xmm0
movsd       xmm0,mmword ptr [ecx+30h]
addsd       xmm0,mmword ptr [eax+40h]
movsd       mmword ptr [eax+40h],xmm0
#include <iostream>
#include <iomanip>
#include <cmath>
#include <string>

// # TBB_TIMING 정의

#ifdef TBB_TIMING   
#include <tbb/tick_count.h>
using tbb::tick_count;
#else
#include <time.h>
#endif

using namespace std;

// # preallocate_memory new_cont 정의

enum { new_cont, new_sep };

double *a1, *b1, *c1, *d1;


void allo(int cont, int n)
{
    switch(cont) {
      case new_cont:
        a1 = new double[n*4];
        b1 = a1 + n;
        c1 = b1 + n;
        d1 = c1 + n;
        break;
      case new_sep:
        a1 = new double[n];
        b1 = new double[n];
        c1 = new double[n];
        d1 = new double[n];
        break;
    }

    for (int i = 0; i < n; i++) {
        a1[i] = 1.0;
        d1[i] = 1.0;
        c1[i] = 1.0;
        b1[i] = 1.0;
    }
}

void ff(int cont)
{
    switch(cont){
      case new_sep:
        delete[] b1;
        delete[] c1;
        delete[] d1;
      case new_cont:
        delete[] a1;
    }
}

double plain(int n, int m, int cont, int loops)
{
#ifndef preallocate_memory
    allo(cont,n);
#endif

#ifdef TBB_TIMING   
    tick_count t0 = tick_count::now();
#else
    clock_t start = clock();
#endif
        
    if (loops == 1) {
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < n; j++){
                a1[j] += b1[j];
                c1[j] += d1[j];
            }
        }
    } else {
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < n; j++) {
                a1[j] += b1[j];
            }
            for (int j = 0; j < n; j++) {
                c1[j] += d1[j];
            }
        }
    }
    double ret;

#ifdef TBB_TIMING   
    tick_count t1 = tick_count::now();
    ret = 2.0*double(n)*double(m)/(t1-t0).seconds();
#else
    clock_t end = clock();
    ret = 2.0*double(n)*double(m)/(double)(end - start) *double(CLOCKS_PER_SEC);
#endif
    
#ifndef preallocate_memory
    ff(cont);
#endif

    return ret;
}


void main()
{   
    freopen("C:\\test.csv", "w", stdout);

    char *s = " ";

    string na[2] ={"new_cont", "new_sep"};

    cout << "n";

    for (int j = 0; j < 2; j++)
        for (int i = 1; i <= 2; i++)
#ifdef preallocate_memory
            cout << s << i << "_loops_" << na[preallocate_memory];
#else
            cout << s << i << "_loops_" << na[j];
#endif
            
    cout << endl;

    long long nmax = 1000000;

#ifdef preallocate_memory
    allo(preallocate_memory, nmax);
#endif
    
    for (long long n = 1L; n < nmax; n = max(n+1, long long(n*1.2)))
    {
        const long long m = 10000000/n;
        cout << n;

        for (int j = 0; j < 2; j++)
            for (int i = 1; i <= 2; i++)
                cout << s << plain(n, m, j, i);
        cout << endl;
    }
}

mrivanlima



Answer #1

다음은 테스트 코드입니다.

int main(){
    const int n = 100000;

#ifdef ALLOCATE_SEPERATE
    double *a1 = (double*)malloc(n * sizeof(double));
    double *b1 = (double*)malloc(n * sizeof(double));
    double *c1 = (double*)malloc(n * sizeof(double));
    double *d1 = (double*)malloc(n * sizeof(double));
#else
    double *a1 = (double*)malloc(n * sizeof(double) * 4);
    double *b1 = a1 + n;
    double *c1 = b1 + n;
    double *d1 = c1 + n;
#endif

    // 비정규 화 가능성을 방지하기 위해 데이터를 제로화합니다.
    memset(a1,0,n * sizeof(double));
    memset(b1,0,n * sizeof(double));
    memset(c1,0,n * sizeof(double));
    memset(d1,0,n * sizeof(double));

    // 주소 인쇄
    cout << a1 << endl;
    cout << b1 << endl;
    cout << c1 << endl;
    cout << d1 << endl;

    clock_t start = clock();

    int c = 0;
    while (c++ < 10000){

#if ONE_LOOP
        for(int j=0;j<n;j++){
            a1[j] += b1[j];
            c1[j] += d1[j];
        }
#else
        for(int j=0;j<n;j++){
            a1[j] += b1[j];
        }
        for(int j=0;j<n;j++){
            c1[j] += d1[j];
        }
#endif

    }

    clock_t end = clock();
    cout << "seconds = " << (double)(end - start) / CLOCKS_PER_SEC << endl;

    system("pause");
    return 0;
}

2 x Intel Xeon X5482 Harpertown @ 3.2 GHz:

#define ALLOCATE_SEPERATE
#define ONE_LOOP
00600020
006D0020
007A0020
00870020
seconds = 6.206

#define ALLOCATE_SEPERATE
// # ONE_LOOP 정의
005E0020
006B0020
00780020
00850020
seconds = 2.116

// # ALLOCATE_SEPERATE 정의
#define ONE_LOOP
00570020
00633520
006F6A20
007B9F20
seconds = 1.894

// # ALLOCATE_SEPERATE 정의
// # ONE_LOOP 정의
008C0020
00983520
00A46A20
00B09F20
seconds = 1.993