内存拷贝优化(2)-全尺寸拷贝优化
四年前写过一篇小内存拷贝优化:http://www.skywind.me/blog/archives/143 纠结了一下还是把全尺寸拷贝优化代码发布出来吧,没啥好保密的, 如今总结一下全尺寸内存拷贝优化的要点:
- 策略区别:64字节以内用小内存方案,64K以内用中尺寸方案,大于64K用大内存拷贝方案。
- 查表跳转:拷贝不同小尺寸内存,直接跳转到相应地址解除循环。
- 目标对齐:64字节以上拷贝的先用普通方法拷贝几个字节让目标地址对齐,好做后面的事情。
- 矢量拷贝:并行一次性读入N个矢量到 sse2 寄存器,再并行写出。
- 缓存预取:使用 prefetchnta ,提前预取数据,等到真的要用时数据已经到位。
- 内存直写:使用 movntdq 来直写内存,避免缓存污染。
部分理论,见论文:《Using Block Prefetch for Optimized Memory Performance》 但论文考虑问题比较单一,所以实际代码写的比论文复杂不少,目前在各个尺寸上基本平均能够加速 40%,比较GCC 4.9, VS2012的 memcpy,不排除未来的 libc, crt库继续完善以后,能够达到下面代码的速度。但我看libc和crt的 memcpy代码已经很久没人更新了,不知道他们还愿意继续优化下去么? 行了,具体实现各位读代码吧,需要 SSE2 指令集支持,gcc编译时需要 –msse2 一下,点击(more)展开代码,测试结果附在源文件最后注释部分:
//=====================================================================
//
// FastMemcpy.c - skywind3000@163.com, 2012
//
// feature:
// 40% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9)
//
//=====================================================================
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <windows.h>
#include <emmintrin.h>
void *memcpy_tiny(void *dst, const void *src, size_t len)
{
if (len <= 64) {
register unsigned char *dd = (unsigned char*)dst + len;
register const unsigned char *ss = (const unsigned char*)src + len;
switch (len) {
case 68: *((int*)(dd - 68)) = *((int*)(ss - 68));
case 64: *((int*)(dd - 64)) = *((int*)(ss - 64));
case 60: *((int*)(dd - 60)) = *((int*)(ss - 60));
case 56: *((int*)(dd - 56)) = *((int*)(ss - 56));
case 52: *((int*)(dd - 52)) = *((int*)(ss - 52));
case 48: *((int*)(dd - 48)) = *((int*)(ss - 48));
case 44: *((int*)(dd - 44)) = *((int*)(ss - 44));
case 40: *((int*)(dd - 40)) = *((int*)(ss - 40));
case 36: *((int*)(dd - 36)) = *((int*)(ss - 36));
case 32: *((int*)(dd - 32)) = *((int*)(ss - 32));
case 28: *((int*)(dd - 28)) = *((int*)(ss - 28));
case 24: *((int*)(dd - 24)) = *((int*)(ss - 24));
case 20: *((int*)(dd - 20)) = *((int*)(ss - 20));
case 16: *((int*)(dd - 16)) = *((int*)(ss - 16));
case 12: *((int*)(dd - 12)) = *((int*)(ss - 12));
case 8: *((int*)(dd - 8)) = *((int*)(ss - 8));
case 4: *((int*)(dd - 4)) = *((int*)(ss - 4));
break;
case 67: *((int*)(dd - 67)) = *((int*)(ss - 67));
case 63: *((int*)(dd - 63)) = *((int*)(ss - 63));
case 59: *((int*)(dd - 59)) = *((int*)(ss - 59));
case 55: *((int*)(dd - 55)) = *((int*)(ss - 55));
case 51: *((int*)(dd - 51)) = *((int*)(ss - 51));
case 47: *((int*)(dd - 47)) = *((int*)(ss - 47));
case 43: *((int*)(dd - 43)) = *((int*)(ss - 43));
case 39: *((int*)(dd - 39)) = *((int*)(ss - 39));
case 35: *((int*)(dd - 35)) = *((int*)(ss - 35));
case 31: *((int*)(dd - 31)) = *((int*)(ss - 31));
case 27: *((int*)(dd - 27)) = *((int*)(ss - 27));
case 23: *((int*)(dd - 23)) = *((int*)(ss - 23));
case 19: *((int*)(dd - 19)) = *((int*)(ss - 19));
case 15: *((int*)(dd - 15)) = *((int*)(ss - 15));
case 11: *((int*)(dd - 11)) = *((int*)(ss - 11));
case 7: *((int*)(dd - 7)) = *((int*)(ss - 7));
*((int*)(dd - 4)) = *((int*)(ss - 4));
break;
case 3: *((short*)(dd - 3)) = *((short*)(ss - 3));
dd[-1] = ss[-1];
break;
case 66: *((int*)(dd - 66)) = *((int*)(ss - 66));
case 62: *((int*)(dd - 62)) = *((int*)(ss - 62));
case 58: *((int*)(dd - 58)) = *((int*)(ss - 58));
case 54: *((int*)(dd - 54)) = *((int*)(ss - 54));
case 50: *((int*)(dd - 50)) = *((int*)(ss - 50));
case 46: *((int*)(dd - 46)) = *((int*)(ss - 46));
case 42: *((int*)(dd - 42)) = *((int*)(ss - 42));
case 38: *((int*)(dd - 38)) = *((int*)(ss - 38));
case 34: *((int*)(dd - 34)) = *((int*)(ss - 34));
case 30: *((int*)(dd - 30)) = *((int*)(ss - 30));
case 26: *((int*)(dd - 26)) = *((int*)(ss - 26));
case 22: *((int*)(dd - 22)) = *((int*)(ss - 22));
case 18: *((int*)(dd - 18)) = *((int*)(ss - 18));
case 14: *((int*)(dd - 14)) = *((int*)(ss - 14));
case 10: *((int*)(dd - 10)) = *((int*)(ss - 10));
case 6: *((int*)(dd - 6)) = *((int*)(ss - 6));
case 2: *((short*)(dd - 2)) = *((short*)(ss - 2));
break;
case 65: *((int*)(dd - 65)) = *((int*)(ss - 65));
case 61: *((int*)(dd - 61)) = *((int*)(ss - 61));
case 57: *((int*)(dd - 57)) = *((int*)(ss - 57));
case 53: *((int*)(dd - 53)) = *((int*)(ss - 53));
case 49: *((int*)(dd - 49)) = *((int*)(ss - 49));
case 45: *((int*)(dd - 45)) = *((int*)(ss - 45));
case 41: *((int*)(dd - 41)) = *((int*)(ss - 41));
case 37: *((int*)(dd - 37)) = *((int*)(ss - 37));
case 33: *((int*)(dd - 33)) = *((int*)(ss - 33));
case 29: *((int*)(dd - 29)) = *((int*)(ss - 29));
case 25: *((int*)(dd - 25)) = *((int*)(ss - 25));
case 21: *((int*)(dd - 21)) = *((int*)(ss - 21));
case 17: *((int*)(dd - 17)) = *((int*)(ss - 17));
case 13: *((int*)(dd - 13)) = *((int*)(ss - 13));
case 9: *((int*)(dd - 9)) = *((int*)(ss - 9));
case 5: *((int*)(dd - 5)) = *((int*)(ss - 5));
case 1: dd[-1] = ss[-1];
break;
case 0:
default: break;
}
return dd;
} else {
return memcpy(dst, src, len);
}
}
void* memcpy_fast(void *destination, const void *source, size_t size)
{
unsigned char *dst = (unsigned char*)destination;
const unsigned char *src = (const unsigned char*)source;
static size_t cachesize = 0x10000;
size_t diff;
// small memory copy
if (size < 64) {
return memcpy_tiny(dst, src, size);
}
// align destination to 16 bytes boundary
diff = (((size_t)dst + 15) & (~15)) - ((size_t)dst);
if (diff > 0) {
memcpy_tiny(dst, src, diff);
dst += diff;
src += diff;
size -= diff;
}
// medium size copy
if (size <= cachesize) {
__m128i c1, c2, c3, c4;
if ((((size_t)src) & 15) == 0) { // source aligned
for (; size >= 64; size -= 64) {
c1 = _mm_load_si128(((const __m128i*)src) + 0);
c2 = _mm_load_si128(((const __m128i*)src) + 1);
c3 = _mm_load_si128(((const __m128i*)src) + 2);
c4 = _mm_load_si128(((const __m128i*)src) + 3);
src += 64;
_mm_store_si128((((__m128i*)dst) + 0), c1);
_mm_store_si128((((__m128i*)dst) + 1), c2);
_mm_store_si128((((__m128i*)dst) + 2), c3);
_mm_store_si128((((__m128i*)dst) + 3), c4);
dst += 64;
}
}
else { // source un-aligned
for (; size >= 64; size -= 64) {
c1 = _mm_loadu_si128(((const __m128i*)src) + 0);
c2 = _mm_loadu_si128(((const __m128i*)src) + 1);
c3 = _mm_loadu_si128(((const __m128i*)src) + 2);
c4 = _mm_loadu_si128(((const __m128i*)src) + 3);
src += 64;
_mm_store_si128((((__m128i*)dst) + 0), c1);
_mm_store_si128((((__m128i*)dst) + 1), c2);
_mm_store_si128((((__m128i*)dst) + 2), c3);
_mm_store_si128((((__m128i*)dst) + 3), c4);
dst += 64;
}
}
}
else { // big memory copy
__m128i c1, c2, c3, c4;
_mm_prefetch((const char*)(src), _MM_HINT_NTA);
if ((((size_t)src) & 15) == 0) { // source aligned
for (; size >= 64; size -= 64) {
_mm_prefetch((const char*)(src + 128), _MM_HINT_NTA);
c1 = _mm_load_si128(((const __m128i*)src) + 0);
c2 = _mm_load_si128(((const __m128i*)src) + 1);
c3 = _mm_load_si128(((const __m128i*)src) + 2);
c4 = _mm_load_si128(((const __m128i*)src) + 3);
src += 64;
_mm_stream_si128((((__m128i*)dst) + 0), c1);
_mm_stream_si128((((__m128i*)dst) + 1), c2);
_mm_stream_si128((((__m128i*)dst) + 2), c3);
_mm_stream_si128((((__m128i*)dst) + 3), c4);
dst += 64;
}
}
else { // source unaligned
for (; size >= 64; size -= 64) {
_mm_prefetch((const char*)(src + 128), _MM_HINT_NTA);
c1 = _mm_loadu_si128(((const __m128i*)src) + 0);
c2 = _mm_loadu_si128(((const __m128i*)src) + 1);
c3 = _mm_loadu_si128(((const __m128i*)src) + 2);
c4 = _mm_loadu_si128(((const __m128i*)src) + 3);
src += 64;
_mm_stream_si128((((__m128i*)dst) + 0), c1);
_mm_stream_si128((((__m128i*)dst) + 1), c2);
_mm_stream_si128((((__m128i*)dst) + 2), c3);
_mm_stream_si128((((__m128i*)dst) + 3), c4);
dst += 64;
}
}
_mm_sfence();
}
return memcpy_tiny(dst, src, size);
}
void benchmark(int dstalign, int srcalign, size_t size, int times)
{
char *DATA1 = (char*)malloc(size + 64);
char *DATA2 = (char*)malloc(size + 64);
size_t LINEAR1 = ((size_t)DATA1);
size_t LINEAR2 = ((size_t)DATA2);
char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1);
char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2);
char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1);
char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3);
DWORD t1, t2;
int k;
Sleep(100);
t1 = timeGetTime();
for (k = times; k > 0; k--) {
memcpy(dst, src, size);
}
t1 = timeGetTime() - t1;
Sleep(100);
t2 = timeGetTime();
for (k = times; k > 0; k--) {
memcpy_fast(dst, src, size);
}
t2 = timeGetTime() - t2;
free(DATA1);
free(DATA2);
printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n",
dstalign? "aligned" : "unalign",
srcalign? "aligned" : "unalign", (int)t2, (int)t1);
}
void bench(int copysize, int times)
{
printf("benchmark(size=%d bytes, times=%d):\n", copysize, times);
benchmark(1, 1, copysize, times);
benchmark(1, 0, copysize, times);
benchmark(0, 1, copysize, times);
benchmark(0, 0, copysize, times);
printf("\n");
}
#ifdef _MSC_VER
#pragma comment(lib, "winmm.lib")
#endif
int main(void)
{
bench(32, 0x1000000);
bench(64, 0x1000000);
bench(512, 0x800000);
bench(1024, 0x400000);
bench(4096, 0x80000);
bench(8192, 0x40000);
bench(1024 * 1024 * 1, 0x800);
bench(1024 * 1024 * 4, 0x200);
bench(1024 * 1024 * 8, 0x100);
return 0;
}
/*
result: gcc4.9 (msvc 2012 got a similar result):
benchmark(size=32 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_sse2=180ms memcpy=249 ms
result(dst aligned, src unalign): memcpy_sse2=170ms memcpy=271 ms
result(dst unalign, src aligned): memcpy_sse2=179ms memcpy=269 ms
result(dst unalign, src unalign): memcpy_sse2=180ms memcpy=260 ms
benchmark(size=64 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_sse2=162ms memcpy=300 ms
result(dst aligned, src unalign): memcpy_sse2=199ms memcpy=328 ms
result(dst unalign, src aligned): memcpy_sse2=410ms memcpy=339 ms
result(dst unalign, src unalign): memcpy_sse2=390ms memcpy=361 ms
benchmark(size=512 bytes, times=8388608):
result(dst aligned, src aligned): memcpy_sse2=160ms memcpy=241 ms
result(dst aligned, src unalign): memcpy_sse2=200ms memcpy=519 ms
result(dst unalign, src aligned): memcpy_sse2=313ms memcpy=509 ms
result(dst unalign, src unalign): memcpy_sse2=311ms memcpy=520 ms
benchmark(size=1024 bytes, times=4194304):
result(dst aligned, src aligned): memcpy_sse2=145ms memcpy=179 ms
result(dst aligned, src unalign): memcpy_sse2=180ms memcpy=430 ms
result(dst unalign, src aligned): memcpy_sse2=245ms memcpy=430 ms
result(dst unalign, src unalign): memcpy_sse2=230ms memcpy=455 ms
benchmark(size=4096 bytes, times=524288):
result(dst aligned, src aligned): memcpy_sse2=80ms memcpy=80 ms
result(dst aligned, src unalign): memcpy_sse2=110ms memcpy=205 ms
result(dst unalign, src aligned): memcpy_sse2=110ms memcpy=224 ms
result(dst unalign, src unalign): memcpy_sse2=110ms memcpy=200 ms
benchmark(size=8192 bytes, times=262144):
result(dst aligned, src aligned): memcpy_sse2=70ms memcpy=78 ms
result(dst aligned, src unalign): memcpy_sse2=100ms memcpy=222 ms
result(dst unalign, src aligned): memcpy_sse2=100ms memcpy=210 ms
result(dst unalign, src unalign): memcpy_sse2=100ms memcpy=230 ms
benchmark(size=1048576 bytes, times=2048):
result(dst aligned, src aligned): memcpy_sse2=200ms memcpy=201 ms
result(dst aligned, src unalign): memcpy_sse2=260ms memcpy=270 ms
result(dst unalign, src aligned): memcpy_sse2=263ms memcpy=361 ms
result(dst unalign, src unalign): memcpy_sse2=267ms memcpy=321 ms
benchmark(size=4194304 bytes, times=512):
result(dst aligned, src aligned): memcpy_sse2=281ms memcpy=391 ms
result(dst aligned, src unalign): memcpy_sse2=265ms memcpy=407 ms
result(dst unalign, src aligned): memcpy_sse2=313ms memcpy=453 ms
result(dst unalign, src unalign): memcpy_sse2=282ms memcpy=439 ms
benchmark(size=8388608 bytes, times=256):
result(dst aligned, src aligned): memcpy_sse2=266ms memcpy=422 ms
result(dst aligned, src unalign): memcpy_sse2=250ms memcpy=407 ms
result(dst unalign, src aligned): memcpy_sse2=297ms memcpy=516 ms
result(dst unalign, src unalign): memcpy_sse2=281ms memcpy=436 ms
*/