FreeBSD Bugzilla – Attachment 181440 Details for
Bug 218203
Implement AVX2 accelerated Fletcher algorithms
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
Remember
[x]
|
Forgot Password
Login:
[x]
same benchmark but with 128k checksums
fletcher4.cpp (text/plain), 3.55 KB, created by
Adam Stylinski
on 2017-04-03 19:22:34 UTC
(
hide
)
Description:
same benchmark but with 128k checksums
Filename:
MIME Type:
Creator:
Adam Stylinski
Created:
2017-04-03 19:22:34 UTC
Size:
3.55 KB
patch
obsolete
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <immintrin.h>

#include <chrono>
#include <iostream>
#include <vector>

/* Size in bytes of the buffer that is checksummed by the benchmark (128 KiB). */
#define NUMBYTES (1 << 17)

/*
 * Let the AVX2 routine be compiled even when the translation unit itself is
 * not built with -mavx2; callers gate it on fletcher_4_avx2_available().
 */
#if defined(__GNUC__) || defined(__clang__)
#define FLETCHER4_TARGET_AVX2 __attribute__((target("avx2")))
#else
#define FLETCHER4_TARGET_AVX2
#endif

/*
 * Report whether the running CPU can execute fletcher_4_nativeavx().
 * On compilers without __builtin_cpu_supports() no probe is performed and
 * the build flags are trusted instead.
 */
static bool fletcher_4_avx2_available()
{
#if defined(__GNUC__) || defined(__clang__)
	return __builtin_cpu_supports("avx2") != 0;
#else
	return true;
#endif
}

/*
 * AVX2 implementation of the Fletcher-4 checksum.
 *
 * buf    - input data, read as native-endian 32-bit words; must be suitably
 *          aligned for uint32_t (the vector loads themselves are unaligned).
 * size   - length of buf in bytes; trailing bytes that do not form a whole
 *          32-bit word are ignored, exactly as in fletcher_4_native().
 * chksum - receives the four 64-bit running sums (a, b, c, d).
 *
 * Four interleaved streams are accumulated in the 64-bit lanes of the vector
 * registers and recombined into the scalar sums afterwards. Words left over
 * when the word count is not a multiple of four are folded in by a scalar
 * tail loop, so the result matches fletcher_4_native() for every size and
 * nothing is read past the end of the buffer.
 */
FLETCHER4_TARGET_AVX2
void fletcher_4_nativeavx(const void *buf, uint64_t size, uint64_t *chksum)
{
	const uint32_t *ip = static_cast<const uint32_t *>(buf);
	const uint64_t nwords = size / sizeof (uint32_t);
	const uint32_t *ipend = ip + nwords;
	/* End of the part that can be consumed four words at a time. */
	const uint32_t *vecend = ip + (nwords & ~uint64_t{3});

	/* Start every lane of every running sum at zero. */
	__m256i a = _mm256_setzero_si256();
	__m256i b = _mm256_setzero_si256();
	__m256i c = _mm256_setzero_si256();
	__m256i d = _mm256_setzero_si256();

	for (; ip < vecend; ip += 4) {
		/* Zero-extend four 32-bit words into four 64-bit lanes. */
		const __m256i data = _mm256_cvtepu32_epi64(
		    _mm_loadu_si128(reinterpret_cast<const __m128i *>(ip)));
		a = _mm256_add_epi64(a, data);
		b = _mm256_add_epi64(b, a);
		c = _mm256_add_epi64(c, b);
		d = _mm256_add_epi64(d, c);
	}

	/*
	 * Recombine the per-lane sums into the scalar checksum. The recurrence
	 * is the one described in the Intel whitepaper
	 * https://software.intel.com/en-us/articles/fast-computation-of-fletcher-checksums
	 *
	 * _mm256_add_epi64 wraps modulo 2^64, which is exactly the unsigned
	 * arithmetic the scalar code performs, so the lanes can be treated as
	 * uint64_t. There is no horizontal add for 64-bit lanes; spill them to
	 * memory and sum in scalar code.
	 */
	uint64_t aVec[4], bVec[4], cVec[4], dVec[4];
	_mm256_storeu_si256(reinterpret_cast<__m256i *>(aVec), a);
	_mm256_storeu_si256(reinterpret_cast<__m256i *>(bVec), b);
	_mm256_storeu_si256(reinterpret_cast<__m256i *>(cVec), c);
	_mm256_storeu_si256(reinterpret_cast<__m256i *>(dVec), d);

	uint64_t aSum = aVec[0] + aVec[1] + aVec[2] + aVec[3];
	uint64_t bSum = 0 - aVec[1] - 2 * aVec[2] - 3 * aVec[3]
	    + 4 * (bVec[0] + bVec[1] + bVec[2] + bVec[3]);
	uint64_t cSum = aVec[2] + 3 * aVec[3]
	    - 6 * bVec[0] - 10 * bVec[1] - 14 * bVec[2] - 18 * bVec[3]
	    + 16 * (cVec[0] + cVec[1] + cVec[2] + cVec[3]);
	uint64_t dSum = 0 - aVec[3]
	    + 4 * bVec[0] + 10 * bVec[1] + 20 * bVec[2] + 34 * bVec[3]
	    - 48 * cVec[0] - 64 * cVec[1] - 80 * cVec[2] - 96 * cVec[3]
	    + 64 * (dVec[0] + dVec[1] + dVec[2] + dVec[3]);

	/* Scalar tail: at most three words that did not fill a vector. */
	for (; ip < ipend; ip++) {
		aSum += ip[0];
		bSum += aSum;
		cSum += bSum;
		dSum += cSum;
	}

	chksum[0] = aSum;
	chksum[1] = bSum;
	chksum[2] = cSum;
	chksum[3] = dSum;
}


/*
 * Reference scalar implementation of the Fletcher-4 checksum.
 *
 * buf    - input data, read as native-endian 32-bit words; must be suitably
 *          aligned for uint32_t.
 * size   - length of buf in bytes; trailing bytes that do not form a whole
 *          32-bit word are ignored.
 * chksum - receives the four 64-bit running sums (a, b, c, d).
 */
void fletcher_4_native(const void *buf, uint64_t size, uint64_t *chksum)
{
	const uint32_t *ip = static_cast<const uint32_t *>(buf);
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a, b, c, d;

	for (a = b = c = d = 0; ip < ipend; ip++) {
		a += ip[0];
		b += a;
		c += b;
		d += c;
	}

	chksum[0] = a;
	chksum[1] = b;
	chksum[2] = c;
	chksum[3] = d;
}

/*
 * Benchmark driver: checksums NUMBYTES of random data with both
 * implementations, prints the wall-clock time of each and then both results
 * (scalar first, AVX2 second) so they can be compared by eye.
 *
 * Define FLETCHER4_BENCH_NO_MAIN to compile only the checksum routines, e.g.
 * to link them into another program or a test driver.
 */
#ifndef FLETCHER4_BENCH_NO_MAIN
int main(void)
{
	if (!fletcher_4_avx2_available()) {
		fprintf(stderr, "this CPU does not support AVX2\n");
		return EXIT_FAILURE;
	}

	std::vector<char> buf(NUMBYTES);

	FILE *randFile = fopen("/dev/urandom", "rb");
	if (randFile == nullptr) {
		perror("/dev/urandom");
		return EXIT_FAILURE;
	}
	const size_t nread = fread(buf.data(), sizeof (char), buf.size(),
	    randFile);
	fclose(randFile);
	if (nread != buf.size()) {
		fprintf(stderr, "short read from /dev/urandom\n");
		return EXIT_FAILURE;
	}

	uint64_t chksum[4];
	uint64_t chksum2[4];

	auto start = std::chrono::steady_clock::now();
	fletcher_4_nativeavx(buf.data(), NUMBYTES, chksum2);
	auto end = std::chrono::steady_clock::now();
	auto diff = end - start;
	std::cout <<
	    std::chrono::duration<double, std::milli>(diff).count() <<
	    " ms total (avx)" << std::endl;

	start = std::chrono::steady_clock::now();
	fletcher_4_native(buf.data(), NUMBYTES, chksum);
	end = std::chrono::steady_clock::now();
	diff = end - start;
	std::cout <<
	    std::chrono::duration<double, std::milli>(diff).count() <<
	    " ms total (scalar)" << std::endl;

	/* PRIu64 instead of %lu: uint64_t is not unsigned long everywhere. */
	printf("a = %" PRIu64 ", b = %" PRIu64 ", c = %" PRIu64
	    ", d = %" PRIu64 "\n",
	    chksum[0], chksum[1], chksum[2], chksum[3]);
	printf("a = %" PRIu64 ", b = %" PRIu64 ", c = %" PRIu64
	    ", d = %" PRIu64 "\n",
	    chksum2[0], chksum2[1], chksum2[2], chksum2[3]);

	return 0;
}
#endif /* FLETCHER4_BENCH_NO_MAIN */
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Raw
Actions:
View
Attachments on
bug 218203
:
181322
|
181326
| 181440