内存拷贝优化(1)-小内存拷贝优化
相信大家代码里有很多地方用到memcpy这个函数,相信这个函数的占用是不小的,有时优化了memcpy,能使整个项目的运行效率提升。通过适当的编码技巧,让我们的内存拷贝速度超过memcpy两倍,是可以实现的。 有人说memcpy还能优化么?不就是rep movsd么?CPU和内存之间的带宽是固定的,怎么可能优化呢?其实是普通的内存拷贝并没有发挥全部的带宽,很多被浪费掉了,比如要等到数据完全读取成功后再去写入,然后要写入成功后再去读取新的。而优化本身就是使这两者尽量的并行,发挥最大的带宽。
现代的内存拷贝都需要判断内存大小,并按照大小选择不同策略进行拷贝,比如大内存拷贝(超过cache大小),那么最好使用并行若干读取指令和写入指令,然后再并行写入,使得CPU前后结果依赖得以大大降低,并且使用缓冲预取,再CPU处理数据之前,就把数据放到离CPU最近的CACHE。这样已经可以比memcpy快很多了,如果再加上一些新指令的帮助,大内存拷贝会大大提速。
但是用同样的代码去拷贝小内存,因为额外的开销,难对齐的内存,准备工作一大堆,如果实际要拷贝的内存很小的话,这样的工作开销还比直接按照 dword复制慢很多。在VC10的memcpy实现中将内存按照128个字节大小进行区分,小于该大小的使用普通拷贝,大于该大小的使用普通SSE指令拷贝,而现在我们就要来挑战VC10的memcpy,本篇先叙述小内存拷贝部分。
适合拷贝64字节以内的数据量。原理很简单,LOOP UNROLL。rep movsb/movsd是靠不住的,小内存拷贝还是得展开循环。
废话不多说,代码贴上:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <mmsystem.h>
#ifdef _MSC_VER
#pragma comment(lib, "winmm.lib")
#endif
void *memcpy_tiny(void *dst, const void *src, size_t len)
{
if (len <= 63) {
register unsigned char *dd = (unsigned char*)dst + len;
register const unsigned char *ss = (const unsigned char*)src + len;
switch (len)
{
case 68: *((int*)(dd - 68)) = *((int*)(ss - 68));
case 64: *((int*)(dd - 64)) = *((int*)(ss - 64));
case 60: *((int*)(dd - 60)) = *((int*)(ss - 60));
case 56: *((int*)(dd - 56)) = *((int*)(ss - 56));
case 52: *((int*)(dd - 52)) = *((int*)(ss - 52));
case 48: *((int*)(dd - 48)) = *((int*)(ss - 48));
case 44: *((int*)(dd - 44)) = *((int*)(ss - 44));
case 40: *((int*)(dd - 40)) = *((int*)(ss - 40));
case 36: *((int*)(dd - 36)) = *((int*)(ss - 36));
case 32: *((int*)(dd - 32)) = *((int*)(ss - 32));
case 28: *((int*)(dd - 28)) = *((int*)(ss - 28));
case 24: *((int*)(dd - 24)) = *((int*)(ss - 24));
case 20: *((int*)(dd - 20)) = *((int*)(ss - 20));
case 16: *((int*)(dd - 16)) = *((int*)(ss - 16));
case 12: *((int*)(dd - 12)) = *((int*)(ss - 12));
case 8: *((int*)(dd - 8)) = *((int*)(ss - 8));
case 4: *((int*)(dd - 4)) = *((int*)(ss - 4));
break;
case 67: *((int*)(dd - 67)) = *((int*)(ss - 67));
case 63: *((int*)(dd - 63)) = *((int*)(ss - 63));
case 59: *((int*)(dd - 59)) = *((int*)(ss - 59));
case 55: *((int*)(dd - 55)) = *((int*)(ss - 55));
case 51: *((int*)(dd - 51)) = *((int*)(ss - 51));
case 47: *((int*)(dd - 47)) = *((int*)(ss - 47));
case 43: *((int*)(dd - 43)) = *((int*)(ss - 43));
case 39: *((int*)(dd - 39)) = *((int*)(ss - 39));
case 35: *((int*)(dd - 35)) = *((int*)(ss - 35));
case 31: *((int*)(dd - 31)) = *((int*)(ss - 31));
case 27: *((int*)(dd - 27)) = *((int*)(ss - 27));
case 23: *((int*)(dd - 23)) = *((int*)(ss - 23));
case 19: *((int*)(dd - 19)) = *((int*)(ss - 19));
case 15: *((int*)(dd - 15)) = *((int*)(ss - 15));
case 11: *((int*)(dd - 11)) = *((int*)(ss - 11));
case 7: *((int*)(dd - 7)) = *((int*)(ss - 7));
*((int*)(dd - 4)) = *((int*)(ss - 4));
break;
case 3: *((short*)(dd - 3)) = *((short*)(ss - 3));
dd[-1] = ss[-1];
break;
case 66: *((int*)(dd - 66)) = *((int*)(ss - 66));
case 62: *((int*)(dd - 62)) = *((int*)(ss - 62));
case 58: *((int*)(dd - 58)) = *((int*)(ss - 58));
case 54: *((int*)(dd - 54)) = *((int*)(ss - 54));
case 50: *((int*)(dd - 50)) = *((int*)(ss - 50));
case 46: *((int*)(dd - 46)) = *((int*)(ss - 46));
case 42: *((int*)(dd - 42)) = *((int*)(ss - 42));
case 38: *((int*)(dd - 38)) = *((int*)(ss - 38));
case 34: *((int*)(dd - 34)) = *((int*)(ss - 34));
case 30: *((int*)(dd - 30)) = *((int*)(ss - 30));
case 26: *((int*)(dd - 26)) = *((int*)(ss - 26));
case 22: *((int*)(dd - 22)) = *((int*)(ss - 22));
case 18: *((int*)(dd - 18)) = *((int*)(ss - 18));
case 14: *((int*)(dd - 14)) = *((int*)(ss - 14));
case 10: *((int*)(dd - 10)) = *((int*)(ss - 10));
case 6: *((int*)(dd - 6)) = *((int*)(ss - 6));
case 2: *((short*)(dd - 2)) = *((short*)(ss - 2));
break;
case 65: *((int*)(dd - 65)) = *((int*)(ss - 65));
case 61: *((int*)(dd - 61)) = *((int*)(ss - 61));
case 57: *((int*)(dd - 57)) = *((int*)(ss - 57));
case 53: *((int*)(dd - 53)) = *((int*)(ss - 53));
case 49: *((int*)(dd - 49)) = *((int*)(ss - 49));
case 45: *((int*)(dd - 45)) = *((int*)(ss - 45));
case 41: *((int*)(dd - 41)) = *((int*)(ss - 41));
case 37: *((int*)(dd - 37)) = *((int*)(ss - 37));
case 33: *((int*)(dd - 33)) = *((int*)(ss - 33));
case 29: *((int*)(dd - 29)) = *((int*)(ss - 29));
case 25: *((int*)(dd - 25)) = *((int*)(ss - 25));
case 21: *((int*)(dd - 21)) = *((int*)(ss - 21));
case 17: *((int*)(dd - 17)) = *((int*)(ss - 17));
case 13: *((int*)(dd - 13)) = *((int*)(ss - 13));
case 9: *((int*)(dd - 9)) = *((int*)(ss - 9));
case 5: *((int*)(dd - 5)) = *((int*)(ss - 5));
case 1: dd[-1] = ss[-1];
break;
case 0:
default: break;
}
return dd;
} else {
return memcpy(dst, src, len);
}
}
typedef void *(*MemCpyProc)(void *dst, const void *src, size_t size);
void benchmark(MemCpyProc memcpy1, MemCpyProc memcpy2, int x1, int x2)
{
static unsigned char card1[0x20015];
static unsigned char card2[0x20015];
char *align1 = (char*)card1 + ((16 - ((size_t)card1 & 15)) & 15);
char *align2 = (char*)card2 + ((16 - ((size_t)card2 & 15)) & 15);
long totalsize = 0x20000000;
int size, t1, t2, i;
for (size = 64; size >= 8; size = size * 2 / 3) {
int times = totalsize / size;
printf("memcpy: size=%6d\t times=%8d:\t ", size, times);
Sleep(50);
t1 = timeGetTime();
for (i = 0; i < times; i++)
memcpy1(align1 + x1, align2 + x2, size);
t1 = timeGetTime() - t1;
Sleep(50);
t2 = timeGetTime();
for (i = 0; i < times; i++)
memcpy2(align1 + x1, align2 + x2, size);
t2 = timeGetTime() - t2;
printf("t1=%4d\t t2=%4d\n", t1, t2);
}
}
//! flag: -O3
//! exe:
int main(void)
{
printf("\ndst: aligned src: aligned\n");
benchmark(memcpy, memcpy_tiny, 0, 0);
printf("\ndst: un-aligned src: aligned\n");
benchmark(memcpy, memcpy_tiny, 1, 0);
printf("\ndst: aligned src: un-aligned\n");
benchmark(memcpy, memcpy_tiny, 0, 1);
printf("\ndst: un-aligned src: un-aligned\n");
benchmark(memcpy, memcpy_tiny, 3, 1);
return 0;
}
该代码展开了循环,效率比rep movsd之类的高2-3倍,比memcpy平均快2倍,下面是测试数据,t1代表memcpy, t2代表memcpy_tiny,使用VC2010最大优化模式编译:
dst: aligned src: aligned
memcpy: size= 64 times= 8388608: t1= 239 t2= 243
memcpy: size= 42 times=12782640: t1= 341 t2= 170
memcpy: size= 28 times=19173961: t1= 225 t2= 205
memcpy: size= 18 times=29826161: t1= 363 t2= 232
memcpy: size= 12 times=44739242: t1= 448 t2= 294
memcpy: size= 8 times=67108864: t1= 699 t2= 406
dst: un-aligned src: aligned
memcpy: size= 64 times= 8388608: t1= 255 t2= 256
memcpy: size= 42 times=12782640: t1= 369 t2= 269
memcpy: size= 28 times=19173961: t1= 289 t2= 165
memcpy: size= 18 times=29826161: t1= 421 t2= 203
memcpy: size= 12 times=44739242: t1= 493 t2= 279
memcpy: size= 8 times=67108864: t1= 956 t2= 354
dst: aligned src: un-aligned
memcpy: size= 64 times= 8388608: t1= 241 t2= 257
memcpy: size= 42 times=12782640: t1= 303 t2= 218
memcpy: size= 28 times=19173961: t1= 258 t2= 181
memcpy: size= 18 times=29826161: t1= 319 t2= 261
memcpy: size= 12 times=44739242: t1= 448 t2= 325
memcpy: size= 8 times=67108864: t1= 646 t2= 401
dst: un-aligned src: un-aligned
memcpy: size= 64 times= 8388608: t1= 243 t2= 257
memcpy: size= 42 times=12782640: t1= 329 t2= 273
memcpy: size= 28 times=19173961: t1= 276 t2= 193
memcpy: size= 18 times=29826161: t1= 336 t2= 249
memcpy: size= 12 times=44739242: t1= 529 t2= 332
memcpy: size= 8 times=67108864: t1= 915 t2= 368
可见平均在64,42,28,18,12,8等几个大小的测试中,平均速度是memcpy的两倍。主要是依靠跳转表来实现的(VC INLINE ASM不能描述跳转表,因此该部分无法写成inline asm模式,索幸switch /case在大部分C编译器中都是用跳转表来实现的,因此我们可以用switch/case来模拟基于汇编指令跳转表的内存拷贝)。该部分反编译的结果:
; 18 : switch (len)
00016 8d 71 ff lea esi, DWORD PTR [ecx-1]
00019 03 c1 add eax, ecx
0001b 83 fe 43 cmp esi, 67 ; 00000043H
0001e 0f 87 fd 01 00
00 ja $LN2@memcpy_tin
00024 ff 24 b5 00 00
00 00 jmp DWORD PTR $LN77@memcpy_tin[esi*4]
$LN70@memcpy_tin:
; 19 : {
; 20 : case 68: *((int*)(dd - 68)) = *((int*)(ss - 68));
0002b 8b 74 0a bc mov esi, DWORD PTR [edx+ecx-68]
0002f 89 70 bc mov DWORD PTR [eax-68], esi
$LN69@memcpy_tin:
; 21 : case 64: *((int*)(dd - 64)) = *((int*)(ss - 64));
00032 8b 74 0a c0 mov esi, DWORD PTR [edx+ecx-64]
00036 89 70 c0 mov DWORD PTR [eax-64], esi
$LN68@memcpy_tin:
; 22 : case 60: *((int*)(dd - 60)) = *((int*)(ss - 60));
00039 8b 74 0a c4 mov esi, DWORD PTR [edx+ecx-60]
0003d 89 70 c4 mov DWORD PTR [eax-60], esi
$LN67@memcpy_tin:
; 23 : case 56: *((int*)(dd - 56)) = *((int*)(ss - 56));
00040 8b 74 0a c8 mov esi, DWORD PTR [edx+ecx-56]
00044 89 70 c8 mov DWORD PTR [eax-56], esi
$LN66@memcpy_tin:
.................................
$LN77@memcpy_tin:
; 105 : }
; 106 : }
0022c 00 00 00 00 DD $LN3@memcpy_tin
00230 00 00 00 00 DD $LN20@memcpy_tin
00234 00 00 00 00 DD $LN37@memcpy_tin
00238 00 00 00 00 DD $LN54@memcpy_tin
0023c 00 00 00 00 DD $LN4@memcpy_tin
00240 00 00 00 00 DD $LN21@memcpy_tin
00244 00 00 00 00 DD $LN38@memcpy_tin
00248 00 00 00 00 DD $LN55@memcpy_tin
0024c 00 00 00 00 DD $LN5@memcpy_tin
00250 00 00 00 00 DD $LN22@memcpy_tin
00254 00 00 00 00 DD $LN39@memcpy_tin
00258 00 00 00 00 DD $LN56@memcpy_tin
0025c 00 00 00 00 DD $LN6@memcpy_tin
00260 00 00 00 00 DD $LN23@memcpy_tin
00264 00 00 00 00 DD $LN40@memcpy_tin
00268 00 00 00 00 DD $LN57@memcpy_tin
0026c 00 00 00 00 DD $LN7@memcpy_tin
00270 00 00 00 00 DD $LN24@memcpy_tin
00274 00 00 00 00 DD $LN41@memcpy_tin
00278 00 00 00 00 DD $LN58@memcpy_tin
0027c 00 00 00 00 DD $LN8@memcpy_tin
00280 00 00 00 00 DD $LN25@memcpy_tin
00284 00 00 00 00 DD $LN42@memcpy_tin
00288 00 00 00 00 DD $LN59@memcpy_tin
大家发现诀窍没有?好了,其实在大部分情况下,比如脚本处理等,很多是64个字节以内的内存拷贝,所以小内存拷贝的优化是比较实在的,请大家继续关注“内存拷贝优化(2)-大内存拷贝优化”。