相信大家代码里有很多地方用到memcpy这个函数,相信这个函数的占用是不小的,有时优化了memcpy,能使整个项目的运行效率提升。通过适当的编码技巧,让我们的内存拷贝速度超过memcpy两倍,是可以实现的。 有人说memcpy还能优化么?不就是rep movsd么?CPU和内存之间的带宽是固定的,怎么可能优化呢?其实是普通的内存拷贝并没有发挥全部的带宽,很多被浪费掉了,比如要等到数据完全读取成功后再去写入,然后要写入成功后再去读取新的。而优化本身就是使这两者尽量的并行,发挥最大的带宽。

现代的内存拷贝都需要判断内存大小,并按照大小选择不同策略进行拷贝,比如大内存拷贝(超过cache大小),那么最好使用并行若干读取指令和写入指令,然后再并行写入,使得CPU前后结果依赖得以大大降低,并且使用缓冲预取,再CPU处理数据之前,就把数据放到离CPU最近的CACHE。这样已经可以比memcpy快很多了,如果再加上一些新指令的帮助,大内存拷贝会大大提速。

但是用同样的代码去拷贝小内存,因为额外的开销,难对齐的内存,准备工作一大堆,如果实际要拷贝的内存很小的话,这样的工作开销还比直接按照 dword复制慢很多。在VC10的memcpy实现中将内存按照128个字节大小进行区分,小于该大小的使用普通拷贝,大于该大小的使用普通SSE指令拷贝,而现在我们就要来挑战VC10的memcpy,本篇先叙述小内存拷贝部分。

适合拷贝64字节以内的数据量。原理很简单,LOOP UNROLL。rep movsb/movsd是靠不住的,小内存拷贝还是得展开循环。

废话不多说,代码贴上:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <mmsystem.h>

#ifdef _MSC_VER
#pragma comment(lib, "winmm.lib")
#endif

void *memcpy_tiny(void *dst, const void *src, size_t len)
{
	if (len <= 63) {
		register unsigned char *dd = (unsigned char*)dst + len;
		register const unsigned char *ss = (const unsigned char*)src + len;
		switch (len)
		{
		case 68:	*((int*)(dd - 68)) = *((int*)(ss - 68));
		case 64:	*((int*)(dd - 64)) = *((int*)(ss - 64));
		case 60:	*((int*)(dd - 60)) = *((int*)(ss - 60));
		case 56:	*((int*)(dd - 56)) = *((int*)(ss - 56));
		case 52:	*((int*)(dd - 52)) = *((int*)(ss - 52));
		case 48:	*((int*)(dd - 48)) = *((int*)(ss - 48));
		case 44:	*((int*)(dd - 44)) = *((int*)(ss - 44));
		case 40:	*((int*)(dd - 40)) = *((int*)(ss - 40));
		case 36:	*((int*)(dd - 36)) = *((int*)(ss - 36));
		case 32:	*((int*)(dd - 32)) = *((int*)(ss - 32));
		case 28:	*((int*)(dd - 28)) = *((int*)(ss - 28));
		case 24:	*((int*)(dd - 24)) = *((int*)(ss - 24));
		case 20:	*((int*)(dd - 20)) = *((int*)(ss - 20));
		case 16:	*((int*)(dd - 16)) = *((int*)(ss - 16));
		case 12:	*((int*)(dd - 12)) = *((int*)(ss - 12));
		case  8:	*((int*)(dd - 8)) = *((int*)(ss - 8));
		case  4:	*((int*)(dd - 4)) = *((int*)(ss - 4));
					break;
		case 67:    *((int*)(dd - 67)) = *((int*)(ss - 67));
		case 63:    *((int*)(dd - 63)) = *((int*)(ss - 63));
		case 59:    *((int*)(dd - 59)) = *((int*)(ss - 59));
		case 55:    *((int*)(dd - 55)) = *((int*)(ss - 55));
		case 51:    *((int*)(dd - 51)) = *((int*)(ss - 51));
		case 47:    *((int*)(dd - 47)) = *((int*)(ss - 47));
		case 43:    *((int*)(dd - 43)) = *((int*)(ss - 43));
		case 39:    *((int*)(dd - 39)) = *((int*)(ss - 39));
		case 35:    *((int*)(dd - 35)) = *((int*)(ss - 35));
		case 31:    *((int*)(dd - 31)) = *((int*)(ss - 31));
		case 27:    *((int*)(dd - 27)) = *((int*)(ss - 27));
		case 23:    *((int*)(dd - 23)) = *((int*)(ss - 23));
		case 19:    *((int*)(dd - 19)) = *((int*)(ss - 19));
		case 15:    *((int*)(dd - 15)) = *((int*)(ss - 15));
		case 11:    *((int*)(dd - 11)) = *((int*)(ss - 11));
		case  7:    *((int*)(dd - 7)) = *((int*)(ss - 7));
					*((int*)(dd - 4)) = *((int*)(ss - 4));
					break;
		case  3:    *((short*)(dd - 3)) = *((short*)(ss - 3));
					dd[-1] = ss[-1];
					break;
		case 66:    *((int*)(dd - 66)) = *((int*)(ss - 66));
		case 62:    *((int*)(dd - 62)) = *((int*)(ss - 62));
		case 58:    *((int*)(dd - 58)) = *((int*)(ss - 58));
		case 54:    *((int*)(dd - 54)) = *((int*)(ss - 54));
		case 50:    *((int*)(dd - 50)) = *((int*)(ss - 50));
		case 46:    *((int*)(dd - 46)) = *((int*)(ss - 46));
		case 42:    *((int*)(dd - 42)) = *((int*)(ss - 42));
		case 38:    *((int*)(dd - 38)) = *((int*)(ss - 38));
		case 34:    *((int*)(dd - 34)) = *((int*)(ss - 34));
		case 30:    *((int*)(dd - 30)) = *((int*)(ss - 30));
		case 26:    *((int*)(dd - 26)) = *((int*)(ss - 26));
		case 22:    *((int*)(dd - 22)) = *((int*)(ss - 22));
		case 18:    *((int*)(dd - 18)) = *((int*)(ss - 18));
		case 14:    *((int*)(dd - 14)) = *((int*)(ss - 14));
		case 10:    *((int*)(dd - 10)) = *((int*)(ss - 10));
		case  6:    *((int*)(dd - 6)) = *((int*)(ss - 6));
		case  2:    *((short*)(dd - 2)) = *((short*)(ss - 2));
					break;
		case 65:    *((int*)(dd - 65)) = *((int*)(ss - 65));
		case 61:    *((int*)(dd - 61)) = *((int*)(ss - 61));
		case 57:    *((int*)(dd - 57)) = *((int*)(ss - 57));
		case 53:    *((int*)(dd - 53)) = *((int*)(ss - 53));
		case 49:    *((int*)(dd - 49)) = *((int*)(ss - 49));
		case 45:    *((int*)(dd - 45)) = *((int*)(ss - 45));
		case 41:    *((int*)(dd - 41)) = *((int*)(ss - 41));
		case 37:    *((int*)(dd - 37)) = *((int*)(ss - 37));
		case 33:    *((int*)(dd - 33)) = *((int*)(ss - 33));
		case 29:    *((int*)(dd - 29)) = *((int*)(ss - 29));
		case 25:    *((int*)(dd - 25)) = *((int*)(ss - 25));
		case 21:    *((int*)(dd - 21)) = *((int*)(ss - 21));
		case 17:    *((int*)(dd - 17)) = *((int*)(ss - 17));
		case 13:    *((int*)(dd - 13)) = *((int*)(ss - 13));
		case  9:    *((int*)(dd - 9)) = *((int*)(ss - 9));
		case  5:    *((int*)(dd - 5)) = *((int*)(ss - 5));
		case  1:    dd[-1] = ss[-1];
					break;
		case 0:
		default: break;
		}
		return dd;
	}	else {
		return memcpy(dst, src, len);
	}
}

typedef void *(*MemCpyProc)(void *dst, const void *src, size_t size);

void benchmark(MemCpyProc memcpy1, MemCpyProc memcpy2, int x1, int x2)
{
	static unsigned char card1[0x20015];
	static unsigned char card2[0x20015];
	char *align1 = (char*)card1 + ((16 - ((size_t)card1 & 15)) & 15);
	char *align2 = (char*)card2 + ((16 - ((size_t)card2 & 15)) & 15);
	long totalsize = 0x20000000;
	int size, t1, t2, i;
	for (size = 64; size >= 8; size = size * 2 / 3) {
		int times = totalsize / size;
		printf("memcpy: size=%6d\t times=%8d:\t ", size, times);
		Sleep(50);
		t1 = timeGetTime();
		for (i = 0; i < times; i++)
			memcpy1(align1 + x1, align2 + x2, size);
		t1 = timeGetTime() - t1;
		Sleep(50);
		t2 = timeGetTime();
		for (i = 0; i < times; i++)
			memcpy2(align1 + x1, align2 + x2, size);
		t2 = timeGetTime() - t2;
		printf("t1=%4d\t t2=%4d\n", t1, t2);
	}
}

//! flag: -O3
//! exe:
int main(void)
{
	printf("\ndst: aligned  src: aligned\n");
	benchmark(memcpy, memcpy_tiny, 0, 0);
	printf("\ndst: un-aligned  src: aligned\n");
	benchmark(memcpy, memcpy_tiny, 1, 0);
	printf("\ndst: aligned  src: un-aligned\n");
	benchmark(memcpy, memcpy_tiny, 0, 1);
	printf("\ndst: un-aligned  src: un-aligned\n");
	benchmark(memcpy, memcpy_tiny, 3, 1);
	return 0;
}

该代码展开了循环,效率比rep movsd之类的高2-3倍,比memcpy平均快2倍,下面是测试数据,t1代表memcpy, t2代表memcpy_tiny,使用VC2010最大优化模式编译:

dst: aligned src: aligned
memcpy: size= 64 times= 8388608: t1= 239 t2= 243
memcpy: size= 42 times=12782640: t1= 341 t2= 170
memcpy: size= 28 times=19173961: t1= 225 t2= 205
memcpy: size= 18 times=29826161: t1= 363 t2= 232
memcpy: size= 12 times=44739242: t1= 448 t2= 294
memcpy: size= 8 times=67108864: t1= 699 t2= 406

dst: un-aligned src: aligned
memcpy: size= 64 times= 8388608: t1= 255 t2= 256
memcpy: size= 42 times=12782640: t1= 369 t2= 269
memcpy: size= 28 times=19173961: t1= 289 t2= 165
memcpy: size= 18 times=29826161: t1= 421 t2= 203
memcpy: size= 12 times=44739242: t1= 493 t2= 279
memcpy: size= 8 times=67108864: t1= 956 t2= 354

dst: aligned src: un-aligned
memcpy: size= 64 times= 8388608: t1= 241 t2= 257
memcpy: size= 42 times=12782640: t1= 303 t2= 218
memcpy: size= 28 times=19173961: t1= 258 t2= 181
memcpy: size= 18 times=29826161: t1= 319 t2= 261
memcpy: size= 12 times=44739242: t1= 448 t2= 325
memcpy: size= 8 times=67108864: t1= 646 t2= 401

dst: un-aligned src: un-aligned
memcpy: size= 64 times= 8388608: t1= 243 t2= 257
memcpy: size= 42 times=12782640: t1= 329 t2= 273
memcpy: size= 28 times=19173961: t1= 276 t2= 193
memcpy: size= 18 times=29826161: t1= 336 t2= 249
memcpy: size= 12 times=44739242: t1= 529 t2= 332
memcpy: size= 8 times=67108864: t1= 915 t2= 368

可见平均在64,42,28,18,12,8等几个大小的测试中,平均速度是memcpy的两倍。主要是依靠跳转表来实现的(VC INLINE ASM不能描述跳转表,因此该部分无法写成inline asm模式,索幸switch /case在大部分C编译器中都是用跳转表来实现的,因此我们可以用switch/case来模拟基于汇编指令跳转表的内存拷贝)。该部分反编译的结果:

    ; 18   : 		switch (len)
      00016	8d 71 ff	 lea	 esi, DWORD PTR [ecx-1]
      00019	03 c1		 add	 eax, ecx
      0001b	83 fe 43	 cmp	 esi, 67			; 00000043H
      0001e	0f 87 fd 01 00
    	00		 ja	 $LN2@memcpy_tin
      00024	ff 24 b5 00 00
    	00 00		 jmp	 DWORD PTR $LN77@memcpy_tin[esi*4]
    $LN70@memcpy_tin:
    ; 19   : 		{
    ; 20   : 		case 68:	*((int*)(dd - 68)) = *((int*)(ss - 68));
      0002b	8b 74 0a bc	 mov	 esi, DWORD PTR [edx+ecx-68]
      0002f	89 70 bc	 mov	 DWORD PTR [eax-68], esi
    $LN69@memcpy_tin:
    ; 21   : 		case 64:	*((int*)(dd - 64)) = *((int*)(ss - 64));
      00032	8b 74 0a c0	 mov	 esi, DWORD PTR [edx+ecx-64]
      00036	89 70 c0	 mov	 DWORD PTR [eax-64], esi
    $LN68@memcpy_tin:
    ; 22   : 		case 60:	*((int*)(dd - 60)) = *((int*)(ss - 60));
      00039	8b 74 0a c4	 mov	 esi, DWORD PTR [edx+ecx-60]
      0003d	89 70 c4	 mov	 DWORD PTR [eax-60], esi
    $LN67@memcpy_tin:
    ; 23   : 		case 56:	*((int*)(dd - 56)) = *((int*)(ss - 56));
      00040	8b 74 0a c8	 mov	 esi, DWORD PTR [edx+ecx-56]
      00044	89 70 c8	 mov	 DWORD PTR [eax-56], esi
    $LN66@memcpy_tin:
    .................................
    $LN77@memcpy_tin:
    ; 105  : 	}
    ; 106  : }
      0022c	00 00 00 00	 DD	 $LN3@memcpy_tin
      00230	00 00 00 00	 DD	 $LN20@memcpy_tin
      00234	00 00 00 00	 DD	 $LN37@memcpy_tin
      00238	00 00 00 00	 DD	 $LN54@memcpy_tin
      0023c	00 00 00 00	 DD	 $LN4@memcpy_tin
      00240	00 00 00 00	 DD	 $LN21@memcpy_tin
      00244	00 00 00 00	 DD	 $LN38@memcpy_tin
      00248	00 00 00 00	 DD	 $LN55@memcpy_tin
      0024c	00 00 00 00	 DD	 $LN5@memcpy_tin
      00250	00 00 00 00	 DD	 $LN22@memcpy_tin
      00254	00 00 00 00	 DD	 $LN39@memcpy_tin
      00258	00 00 00 00	 DD	 $LN56@memcpy_tin
      0025c	00 00 00 00	 DD	 $LN6@memcpy_tin
      00260	00 00 00 00	 DD	 $LN23@memcpy_tin
      00264	00 00 00 00	 DD	 $LN40@memcpy_tin
      00268	00 00 00 00	 DD	 $LN57@memcpy_tin
      0026c	00 00 00 00	 DD	 $LN7@memcpy_tin
      00270	00 00 00 00	 DD	 $LN24@memcpy_tin
      00274	00 00 00 00	 DD	 $LN41@memcpy_tin
      00278	00 00 00 00	 DD	 $LN58@memcpy_tin
      0027c	00 00 00 00	 DD	 $LN8@memcpy_tin
      00280	00 00 00 00	 DD	 $LN25@memcpy_tin
      00284	00 00 00 00	 DD	 $LN42@memcpy_tin
      00288	00 00 00 00	 DD	 $LN59@memcpy_tin

大家发现诀窍没有?好了,其实在大部分情况下,比如脚本处理等,很多是64个字节以内的内存拷贝,所以小内存拷贝的优化是比较实在的,请大家继续关注“内存拷贝优化(2)-大内存拷贝优化”。