unit FastcodeCharPos;

//Version : 0.2 Preliminary version
//Only direct calling supported

// This unit provides several interchangeable implementations of a
// "position of a character in an AnsiString" search. Every routine returns
// the 1-based index of the first occurrence of the character, or 0 when the
// string is nil/empty or the character does not occur.
// The assembler routines use the register convention visible in their code:
// AL carries the character, EDX the string pointer, EAX the result, and the
// string length is read from the dword at [EDX-4].

interface

function CharPosFastcodeP3(Ch : Char; const Str : AnsiString) : Integer;
function CharPosFastcodeP4(Ch : Char; const Str : AnsiString) : Integer;
function CharPosFastcodePrescott(Ch : Char; const Str : AnsiString) : Integer;
function CharPosFastcodeXP(Ch : Char; const Str : AnsiString) : Integer;
function CharPosFastcodeOpteron(lnpChr: Char; const lcpStr: AnsiString): Integer;
function CharPosFastcodePascal(Ch : Char; const Str : AnsiString) : Integer;
function CharPosFastcodeRTL(Ch : Char; const Str : AnsiString) : Integer;
function CharPosFastcodeBlended(Ch : Char; const Str : AnsiString) : Integer;

implementation

uses
  Windows, SysUtils, elCPUID, elCodeHook;

//Author:            John O'Harrow
//Date:              N/A
//Optimized for:     P3
//Instructionset(s): N/A
//Original Name:     CharPosJOH_SSE
//
// Returns the 1-based index of the first Ch in Str, or 0 if there is none.
// Strings of up to 8 characters are compared byte by byte. Longer strings
// are compared 8 bytes at a time in MM0/MM1 (PCMPEQB + PMOVMSKB): first one
// read at the string start, then reads rounded down to an 8-byte boundary,
// and finally one read of the last 8 bytes of the string, which may overlap
// bytes that were already examined.
function CharPosFastcodeP3(Ch : Char; const Str : AnsiString) : Integer;
asm
  TEST      EDX, EDX         {Str = NIL?}
  JZ        @@NotFound       {Yes - Jump}
  MOV       ECX, [EDX-4]     {ECX = Length(Str)}
  CMP       ECX, 8
  JG        @@NotSmall
  TEST      ECX, ECX
  JZ        @@NotFound       {Exit if Length = 0}
@@Small:                     {1..8 characters: ECX counts the characters left}
  CMP       AL, [EDX]
  JZ        @Found1
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+1]
  JZ        @Found2
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+2]
  JZ        @Found3
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+3]
  JZ        @Found4
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+4]
  JZ        @Found5
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+5]
  JZ        @Found6
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+6]
  JZ        @Found7
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+7]
  JZ        @Found8
@@NotFound:
  XOR       EAX, EAX
  RET
@Found1:
  MOV       EAX, 1
  RET
@Found2:
  MOV       EAX, 2
  RET
@Found3:
  MOV       EAX, 3
  RET
@Found4:
  MOV       EAX, 4
  RET
@Found5:
  MOV       EAX, 5
  RET
@Found6:
  MOV       EAX, 6
  RET
@Found7:
  MOV       EAX, 7
  RET
@Found8:
  MOV       EAX, 8
  RET
@@NotSmall:
  MOV       AH, AL           {AX = Ch repeated twice}
  ADD       EDX, ECX         {EDX = end of string}
  MOVD      MM0, EAX
  PSHUFW    MM0, MM0, 0      {Copy the low word of MM0 into all four words}
  PUSH      ECX              {Keep the length on the stack}
  NEG       ECX              {ECX = negative offset from the string end}
@@First8:
  MOVQ      MM1, [EDX+ECX]
  ADD       ECX, 8
  PCMPEQB   MM1, MM0         {Compare Next 8 Bytes}
  PMOVMSKB  EAX, MM1
  TEST      EAX, EAX
  JNZ       @@Matched        {Exit on Match at any Position}
  CMP       ECX, -8          {Check if Next Loop would pass String End}
  JGE       @@Last8
@@Align:                     {Move the read offset back to an 8-byte boundary}
  LEA       EAX, [EDX+ECX]
  AND       EAX, 7
  SUB       ECX, EAX
@@Loop:                      {Loop Unrolled 2X}
  MOVQ      MM1, [EDX+ECX]
  ADD       ECX, 8
  PCMPEQB   MM1, MM0         {Compare Next 8 Bytes}
  PMOVMSKB  EAX, MM1
  TEST      EAX, EAX
  JNZ       @@Matched        {Exit on Match at any Position}
  CMP       ECX, -8          {Check if Next Loop would pass String End}
{$IFNDEF NoUnroll}
  JGE       @@Last8
  MOVQ      MM1, [EDX+ECX]
  ADD       ECX, 8
  PCMPEQB   MM1, MM0         {Compare Next 8 Bytes}
  PMOVMSKB  EAX, MM1
  TEST      EAX, EAX
  JNZ       @@Matched        {Exit on Match at any Position}
  CMP       ECX, -8          {Check if Next Loop would pass String End}
{$ENDIF}
  JL        @@loop
@@Last8:                     {Final, possibly overlapping, read of the last 8 bytes}
  PCMPEQB   MM0, [EDX-8]
  POP       ECX              {Original Length}
  PMOVMSKB  EAX, MM0
  TEST      EAX, EAX
  JNZ       @@Matched2
  EMMS
  RET                        {Finished}
@@Matched:                   {Set Result from 1st Match in EcX}
  POP       EDX              {Original Length}
  ADD       ECX, EDX
@@Matched2:
  EMMS
  BSF       EDX, EAX         {Index of the first matching byte inside the 8-byte group}
  LEA       EAX, [EDX+ECX-7]
end;

//Author:            John O'Harrow
//Date:              N/A
//Optimized for:     P4
//Instructionset(s): N/A
//Original Name:     CharPosJOH_SSE2
//
// Returns the 1-based index of the first Ch in Str, or 0 if there is none.
// Length < 16   : byte-compare ladder (@@16..@@01) entered via @@JumpTable1.
// Length 16..31 : one unaligned 16-byte XMM compare of the string start,
//                 then the ladder for the remaining characters.
// Length >= 32  : as above, followed by an aligned 16-byte loop and the
//                 ladder (entered via @@JumpTable2) for the tail.
// EBX is saved on entry and keeps Ch in BL for the ladder; EAX accumulates
// the 1-based position while the ladder runs.
// NOTE(review): the three table jumps are written with "-ecx*4", yet ECX is
// non-negative at each of them (string length, or the positive value left
// when "jle @@Loop" falls through) and the table order only fits
// "+ecx*4". Presumably the assembler encodes a positive scaled index here -
// confirm against the generated code before touching these operands.
function CharPosFastcodeP4(Ch : Char; const Str : AnsiString) : Integer;
asm
  test      edx, edx
  jz        @@NullString
  mov       ecx, [edx-4]
  push      ebx
  mov       ebx, eax
  cmp       ecx, 16
  jl        @@Small
@@NotSmall:
  mov       ah, al           {Fill each Byte of XMM1 with AL}
  movd      xmm1, eax
  pshuflw   xmm1, xmm1, 0
  pshufd    xmm1, xmm1, 0
@@First16:
  movups    xmm0, [edx]      {Unaligned}
  pcmpeqb   xmm0, xmm1       {Compare First 16 Characters}
  pmovmskb  eax, xmm0
  test      eax, eax
  jnz       @@FoundStart     {Exit on any Match}
  cmp       ecx, 32
  jl        @@Medium         {If Length(Str) < 32, Check Remainder}
@@Align:
  sub       ecx, 16          {Align Block Reads}
  push      ecx              {Saved Length-16, popped at loop exit or in @@Found}
  mov       eax, edx
  neg       eax
  and       eax, 15          {Bytes from the string start up to a 16-byte boundary}
  add       edx, ecx
  neg       ecx
  add       ecx, eax
@@Loop:
  movaps    xmm0, [edx+ecx]  {Aligned}
  pcmpeqb   xmm0, xmm1       {Compare Next 16 Characters}
  pmovmskb  eax, xmm0
  test      eax, eax
  jnz       @@Found          {Exit on any Match}
  add       ecx, 16
  jle       @@Loop
  pop       eax              {Check Remaining Characters}
  add       edx, 16
  add       eax, ecx         {Count from Last Loop End Position}
  jmp       dword ptr [@@JumpTable2-ecx*4]
  nop
  nop
@@NullString:
  xor       eax, eax         {Result = 0}
  ret
  nop
@@FoundStart:
  bsf       eax, eax         {Get Set Bit}
  pop       ebx
  inc       eax              {Set Result}
  ret
  nop
  nop
@@Found:
  pop       edx
  bsf       eax, eax         {Get Set Bit}
  add       edx, ecx
  pop       ebx
  lea       eax, [eax+edx+1] {Set Result}
  ret
@@Medium:
  add       edx, ecx         {End of String}
  mov       eax, 16          {Count from 16}
  jmp       dword ptr [@@JumpTable1-64-ecx*4]
  nop
  nop
@@Small:
  add       edx, ecx         {End of String}
  xor       eax, eax         {Count from 0}
  jmp       dword ptr [@@JumpTable1-ecx*4]
  nop
@@JumpTable1:
  dd        @@NotFound, @@01, @@02, @@03, @@04, @@05, @@06, @@07
  dd        @@08, @@09, @@10, @@11, @@12, @@13, @@14, @@15, @@16
@@JumpTable2:
  dd        @@16, @@15, @@14, @@13, @@12, @@11, @@10, @@09, @@08
  dd        @@07, @@06, @@05, @@04, @@03, @@02, @@01, @@NotFound
@@16:                        {Ladder: label @@NN tests the byte NN before the string end}
  add       eax, 1
  cmp       bl, [edx-16]
  je        @@Done
@@15:
  add       eax, 1
  cmp       bl, [edx-15]
  je        @@Done
@@14:
  add       eax, 1
  cmp       bl, [edx-14]
  je        @@Done
@@13:
  add       eax, 1
  cmp       bl, [edx-13]
  je        @@Done
@@12:
  add       eax, 1
  cmp       bl, [edx-12]
  je        @@Done
@@11:
  add       eax, 1
  cmp       bl, [edx-11]
  je        @@Done
@@10:
  add       eax, 1
  cmp       bl, [edx-10]
  je        @@Done
@@09:
  add       eax, 1
  cmp       bl, [edx-9]
  je        @@Done
@@08:
  add       eax, 1
  cmp       bl, [edx-8]
  je        @@Done
@@07:
  add       eax, 1
  cmp       bl, [edx-7]
  je        @@Done
@@06:
  add       eax, 1
  cmp       bl, [edx-6]
  je        @@Done
@@05:
  add       eax, 1
  cmp       bl, [edx-5]
  je        @@Done
@@04:
  add       eax, 1
  cmp       bl, [edx-4]
  je        @@Done
@@03:
  add       eax, 1
  cmp       bl, [edx-3]
  je        @@Done
@@02:
  add       eax, 1
  cmp       bl, [edx-2]
  je        @@Done
@@01:
  add       eax, 1
  cmp       bl, [edx-1]
  je        @@Done
@@NotFound:
  xor       eax, eax
  pop       ebx
  ret
@@Done:
  pop       ebx
end;

//Author:            John O'Harrow
//Date:              N/A
//Optimized for:     Prescott
//Instructionset(s): N/A
//Original Name:     CharPosJOH_SSE2
//
// Same instruction sequence as CharPosFastcodeP4 in this unit: returns the
// 1-based index of the first Ch in Str, or 0 if there is none, using a
// byte-compare ladder for short strings/tails and 16-byte XMM compares for
// the rest.
// NOTE(review): the "-ecx*4" table operands are used with a non-negative
// ECX; presumably they are encoded as a positive scaled index - confirm.
function CharPosFastcodePrescott(Ch : Char; const Str : AnsiString) : Integer;
asm
  test      edx, edx
  jz        @@NullString
  mov       ecx, [edx-4]
  push      ebx
  mov       ebx, eax
  cmp       ecx, 16
  jl        @@Small
@@NotSmall:
  mov       ah, al           {Fill each Byte of XMM1 with AL}
  movd      xmm1, eax
  pshuflw   xmm1, xmm1, 0
  pshufd    xmm1, xmm1, 0
@@First16:
  movups    xmm0, [edx]      {Unaligned}
  pcmpeqb   xmm0, xmm1       {Compare First 16 Characters}
  pmovmskb  eax, xmm0
  test      eax, eax
  jnz       @@FoundStart     {Exit on any Match}
  cmp       ecx, 32
  jl        @@Medium         {If Length(Str) < 32, Check Remainder}
@@Align:
  sub       ecx, 16          {Align Block Reads}
  push      ecx              {Saved Length-16, popped at loop exit or in @@Found}
  mov       eax, edx
  neg       eax
  and       eax, 15          {Bytes from the string start up to a 16-byte boundary}
  add       edx, ecx
  neg       ecx
  add       ecx, eax
@@Loop:
  movaps    xmm0, [edx+ecx]  {Aligned}
  pcmpeqb   xmm0, xmm1       {Compare Next 16 Characters}
  pmovmskb  eax, xmm0
  test      eax, eax
  jnz       @@Found          {Exit on any Match}
  add       ecx, 16
  jle       @@Loop
  pop       eax              {Check Remaining Characters}
  add       edx, 16
  add       eax, ecx         {Count from Last Loop End Position}
  jmp       dword ptr [@@JumpTable2-ecx*4]
  nop
  nop
@@NullString:
  xor       eax, eax         {Result = 0}
  ret
  nop
@@FoundStart:
  bsf       eax, eax         {Get Set Bit}
  pop       ebx
  inc       eax              {Set Result}
  ret
  nop
  nop
@@Found:
  pop       edx
  bsf       eax, eax         {Get Set Bit}
  add       edx, ecx
  pop       ebx
  lea       eax, [eax+edx+1] {Set Result}
  ret
@@Medium:
  add       edx, ecx         {End of String}
  mov       eax, 16          {Count from 16}
  jmp       dword ptr [@@JumpTable1-64-ecx*4]
  nop
  nop
@@Small:
  add       edx, ecx         {End of String}
  xor       eax, eax         {Count from 0}
  jmp       dword ptr [@@JumpTable1-ecx*4]
  nop
@@JumpTable1:
  dd        @@NotFound, @@01, @@02, @@03, @@04, @@05, @@06, @@07
  dd        @@08, @@09, @@10, @@11, @@12, @@13, @@14, @@15, @@16
@@JumpTable2:
  dd        @@16, @@15, @@14, @@13, @@12, @@11, @@10, @@09, @@08
  dd        @@07, @@06, @@05, @@04, @@03, @@02, @@01, @@NotFound
@@16:                        {Ladder: label @@NN tests the byte NN before the string end}
  add       eax, 1
  cmp       bl, [edx-16]
  je        @@Done
@@15:
  add       eax, 1
  cmp       bl, [edx-15]
  je        @@Done
@@14:
  add       eax, 1
  cmp       bl, [edx-14]
  je        @@Done
@@13:
  add       eax, 1
  cmp       bl, [edx-13]
  je        @@Done
@@12:
  add       eax, 1
  cmp       bl, [edx-12]
  je        @@Done
@@11:
  add       eax, 1
  cmp       bl, [edx-11]
  je        @@Done
@@10:
  add       eax, 1
  cmp       bl, [edx-10]
  je        @@Done
@@09:
  add       eax, 1
  cmp       bl, [edx-9]
  je        @@Done
@@08:
  add       eax, 1
  cmp       bl, [edx-8]
  je        @@Done
@@07:
  add       eax, 1
  cmp       bl, [edx-7]
  je        @@Done
@@06:
  add       eax, 1
  cmp       bl, [edx-6]
  je        @@Done
@@05:
  add       eax, 1
  cmp       bl, [edx-5]
  je        @@Done
@@04:
  add       eax, 1
  cmp       bl, [edx-4]
  je        @@Done
@@03:
  add       eax, 1
  cmp       bl, [edx-3]
  je        @@Done
@@02:
  add       eax, 1
  cmp       bl, [edx-2]
  je        @@Done
@@01:
  add       eax, 1
  cmp       bl, [edx-1]
  je        @@Done
@@NotFound:
  xor       eax, eax
  pop       ebx
  ret
@@Done:
  pop       ebx
end;

//Author:            John O'Harrow
//Date:              N/A
//Optimized for:     XP
//Instructionset(s): N/A
//Original Name:     CharPosJOH_SSE
//
// Same instruction sequence as CharPosFastcodeP3 in this unit: returns the
// 1-based index of the first Ch in Str, or 0 if there is none. Strings of
// up to 8 characters are compared byte by byte; longer ones 8 bytes at a
// time in MM0/MM1, ending with one read of the last 8 bytes of the string.
function CharPosFastcodeXP(Ch : Char; const Str : AnsiString) : Integer;
asm
  TEST      EDX, EDX         {Str = NIL?}
  JZ        @@NotFound       {Yes - Jump}
  MOV       ECX, [EDX-4]     {ECX = Length(Str)}
  CMP       ECX, 8
  JG        @@NotSmall
  TEST      ECX, ECX
  JZ        @@NotFound       {Exit if Length = 0}
@@Small:                     {1..8 characters: ECX counts the characters left}
  CMP       AL, [EDX]
  JZ        @Found1
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+1]
  JZ        @Found2
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+2]
  JZ        @Found3
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+3]
  JZ        @Found4
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+4]
  JZ        @Found5
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+5]
  JZ        @Found6
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+6]
  JZ        @Found7
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+7]
  JZ        @Found8
@@NotFound:
  XOR       EAX, EAX
  RET
@Found1:
  MOV       EAX, 1
  RET
@Found2:
  MOV       EAX, 2
  RET
@Found3:
  MOV       EAX, 3
  RET
@Found4:
  MOV       EAX, 4
  RET
@Found5:
  MOV       EAX, 5
  RET
@Found6:
  MOV       EAX, 6
  RET
@Found7:
  MOV       EAX, 7
  RET
@Found8:
  MOV       EAX, 8
  RET
@@NotSmall:
  MOV       AH, AL           {AX = Ch repeated twice}
  ADD       EDX, ECX         {EDX = end of string}
  MOVD      MM0, EAX
  PSHUFW    MM0, MM0, 0      {Copy the low word of MM0 into all four words}
  PUSH      ECX              {Keep the length on the stack}
  NEG       ECX              {ECX = negative offset from the string end}
@@First8:
  MOVQ      MM1, [EDX+ECX]
  ADD       ECX, 8
  PCMPEQB   MM1, MM0         {Compare Next 8 Bytes}
  PMOVMSKB  EAX, MM1
  TEST      EAX, EAX
  JNZ       @@Matched        {Exit on Match at any Position}
  CMP       ECX, -8          {Check if Next Loop would pass String End}
  JGE       @@Last8
@@Align:                     {Move the read offset back to an 8-byte boundary}
  LEA       EAX, [EDX+ECX]
  AND       EAX, 7
  SUB       ECX, EAX
@@Loop:                      {Loop Unrolled 2X}
  MOVQ      MM1, [EDX+ECX]
  ADD       ECX, 8
  PCMPEQB   MM1, MM0         {Compare Next 8 Bytes}
  PMOVMSKB  EAX, MM1
  TEST      EAX, EAX
  JNZ       @@Matched        {Exit on Match at any Position}
  CMP       ECX, -8          {Check if Next Loop would pass String End}
{$IFNDEF NoUnroll}
  JGE       @@Last8
  MOVQ      MM1, [EDX+ECX]
  ADD       ECX, 8
  PCMPEQB   MM1, MM0         {Compare Next 8 Bytes}
  PMOVMSKB  EAX, MM1
  TEST      EAX, EAX
  JNZ       @@Matched        {Exit on Match at any Position}
  CMP       ECX, -8          {Check if Next Loop would pass String End}
{$ENDIF}
  JL        @@loop
@@Last8:                     {Final, possibly overlapping, read of the last 8 bytes}
  PCMPEQB   MM0, [EDX-8]
  POP       ECX              {Original Length}
  PMOVMSKB  EAX, MM0
  TEST      EAX, EAX
  JNZ       @@Matched2
  EMMS
  RET                        {Finished}
@@Matched:                   {Set Result from 1st Match in EcX}
  POP       EDX              {Original Length}
  ADD       ECX, EDX
@@Matched2:
  EMMS
  BSF       EDX, EAX         {Index of the first matching byte inside the 8-byte group}
  LEA       EAX, [EDX+ECX-7]
end;

//Author:            Les Pawelzcyk
//Date:              N/A
//Optimized for:
//                   Opteron
//Instructionset(s): N/A
//Original Name:     CharPosLPSSE2_3
//
// Returns the 1-based index of the first lnpChr in lcpStr, or 0 if there is
// none (a nil string returns 0 through @strempty, where EDX is zero).
// Length < 16   : computed jump into the 16-step compare ladder that ends
//                 at @zerocnt.
// Length 16..31 : one unaligned 16-byte XMM compare, then the ladder.
// Length >= 32  : as above, followed by an aligned 16-byte loop, then the
//                 ladder for the tail.
// The computed jumps scale the character count by 8 ("ecx*8+offset
// @zerocnt"), so every ladder step (add / cmp / je) has to stay exactly
// 8 bytes of code; do not edit the ladder without re-checking that.
// Change from the earlier revision: the saved "Length-16" value used to be
// parked at [esp-4], i.e. below the stack pointer, where it is not
// protected from being overwritten. It is now pushed and popped instead,
// the same way CharPosFastcodeP4 keeps its copy.
function CharPosFastcodeOpteron(lnpChr: Char; const lcpStr: AnsiString): Integer; register;
asm
  test      edx, edx
  jz        @strempty
  push      ebx
  mov       ecx, [edx-4]     {ECX = Length(lcpStr)}
  mov       ebx, eax         {BL keeps the character for the ladder}
  cmp       ecx, 16
  jl        @smlcnt
  mov       ah, al           {Broadcast the character to every byte of XMM1}
  movd      xmm1, eax
  pshuflw   xmm1, xmm1, 0
  pshufd    xmm1, xmm1, 0
  movups    xmm0, [edx]      {Unaligned compare of the first 16 characters}
  pcmpeqb   xmm0, xmm1
  pmovmskb  eax, xmm0
  test      eax, eax
  jnz       @found2
  cmp       ecx, 32
  jl        @medcnt
  mov       eax, edx
  not       eax
  sub       ecx, 16
  and       eax, 15
  push      ecx              {Save Length-16 on the stack (was: mov [esp-4], ecx)}
  add       eax, 1           {EAX = 16 - (start and 15): distance to the next boundary}
  add       edx, ecx         {EDX = start + Length - 16}
  sub       ecx, eax
  neg       ecx              {EDX+ECX = first 16-byte aligned address after start}
@loop:
  movaps    xmm0, [edx+ecx]
  pcmpeqb   xmm0, xmm1
  pmovmskb  eax, xmm0
  test      eax, eax
  jnz       @found
  add       ecx, 16
  js        @loop
  pop       eax              {EAX = Length-16 ...}
  add       eax, ecx         {... plus loop offset = characters already checked}
  lea       ecx, [ecx*8+offset @zerocnt-128]
  add       edx, 16          {EDX = end of string}
  jmp       ecx              {Enter the ladder at the first unchecked character}
  ud2
@strempty:
  mov       eax, edx         {EDX is zero here, so the result is 0}
  ret
@found:
  bsf       eax, eax
  pop       edx              {EDX = Length-16 (was: mov edx, [esp-4])}
  lea       edx, [edx+ecx+1]
  pop       ebx
  add       eax, edx
  ret
@found2:
  bsf       eax, eax
  pop       ebx
  add       eax, 1
  ret
@medcnt:
  add       edx, ecx
  neg       ecx
  lea       ecx, [ecx*8+offset @zerocnt+128]
  mov       eax, 16          {The first 16 characters are already checked}
  jmp       ecx
  ud2
@smlcnt:
  add       edx, ecx
  neg       ecx
  lea       ecx, [ecx*8+offset @zerocnt]
  xor       eax, eax
  jmp       ecx
  ud2
  add       eax, 1
  cmp       bl, [edx-16]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-15]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-14]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-13]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-12]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-11]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-10]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-9]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-8]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-7]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-6]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-5]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-4]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-3]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-2]
  je        @smlfound
  add       eax, 1
  cmp       bl, [edx-1]
  je        @smlfound
@zerocnt:
  xor       eax, eax
@smlfound:
  pop       ebx
end;

//Author:            Chris Grant
//Date:              N/A
//Optimized for:     Pascal
//Instructionset(s): N/A
//Original Name:     CharPosCJGPas3_b
//
// Pure Pascal search. Returns the 1-based index of the first Ch in Str, or
// 0 if Str is empty or Ch does not occur.
// While more than 16 characters remain, 16 characters are tested per loop
// pass (labels Found0..Found15 compute the result from the current sweep
// position). The final 0..16 characters are handled by jumping into the
// label16..label1 chain, whose hits (F15..F0) are expressed relative to the
// string length.
function CharPosFastcodePascal(Ch : Char; const Str : AnsiString) : Integer;
var
  Len, L2 : Integer;
  P1, P2, P3 : PChar;
label
  Finish, Small;
label
  label1, label2, label3, label4, label5, label6, label7, label8, label9,
  label10, label11, label12, label13, label14, label15, label16;
label
  Found0, Found1, Found2, Found3, Found4, Found5, Found6, Found7, Found8,
  Found9, Found10, Found11, Found12, Found13, Found14, Found15, Found16;
label
  F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, F14, F15, F16;
begin
  If Str='' then
  begin
    Result := 0;
    exit;
  end
  else
  begin
    Len := PInteger(Integer(Str)-4)^; {Length is the dword just before the characters}
    P1 := Pointer(Str); {Point to start of string}
    P2 := P1 + Len; {Point to end of string}
    P3 := P2 - 16; {Point to one sweep before end of string this is only used as a reference}
    if (P1 < P3) then
    begin
      while 1 = 1 do
      begin
        if Ch = P1^ then goto Found0;
        if Ch = P1[1] then goto Found1;
        if Ch = P1[2] then goto Found2;
        if Ch = P1[3] then goto Found3;
        if Ch = P1[4] then goto Found4;
        if Ch = P1[5] then goto Found5;
        if Ch = P1[6] then goto Found6;
        if Ch = P1[7] then goto Found7;
        if Ch = P1[8] then goto Found8;
        if Ch = P1[9] then goto Found9;
        if Ch = P1[10] then goto Found10;
        if Ch = P1[11] then goto Found11;
        if Ch = P1[12] then goto Found12;
        if Ch = P1[13] then goto Found13;
        if Ch = P1[14] then goto Found14;
        if Ch = P1[15] then goto Found15;
        Inc(P1, 16);
        if P1 > P3 then goto Small; {Fewer than 16 characters left}
      end;
    end
    else
      goto Small;
    {Hits inside the sweep: Len + P1 - P2 is the 0-based offset of P1}
    Found15: Result := 16 + Len + P1 - P2; exit;
    Found14: Result := 15 + Len + P1 - P2; exit;
    Found13: Result := 14 + Len + P1 - P2; exit;
    Found12: Result := 13 + Len + P1 - P2; exit;
    Found11: Result := 12 + Len + P1 - P2; exit;
    Found10: Result := 11 + Len + P1 - P2; exit;
    Found9: Result := 10 + Len + P1 - P2; exit;
    Found8: Result := 9 + Len + P1 - P2; exit;
    Found7: Result := 8 + Len + P1 - P2; exit;
    Found6: Result := 7 + Len + P1 - P2; exit;
    Found5: Result := 6 + Len + P1 - P2; exit;
    Found4: Result := 5 + Len + P1 - P2; exit;
    Found3: Result := 4 + Len + P1 - P2; exit;
    Found2: Result := 3 + Len + P1 - P2; exit;
    Found1: Result := 2 + Len + P1 - P2; exit;
    Found0: Result := 1 + Len + P1 - P2; exit;
    Small:
    L2 := P2 - P1; {Characters still unchecked: 0..16}
    case L2 of
      0: goto Finish;
      1: goto label1;
      2: goto label2;
      3: goto label3;
      4: goto label4;
      5: goto label5;
      6: goto label6;
      7: goto label7;
      8: goto label8;
      9: goto label9;
      10: goto label10;
      11: goto label11;
      12: goto label12;
      13: goto label13;
      14: goto label14;
      15: goto label15;
    end;
    {L2 = 16 matches no case branch and falls through to label16}
    label16: if Ch = P1^ then goto F15;
    label15: if Ch = P1[L2-15] then goto F14;
    label14: if Ch = P1[L2-14] then goto F13;
    label13: if Ch = P1[L2-13] then goto F12;
    label12: if Ch = P1[L2-12] then goto F11;
    label11: if Ch = P1[L2-11] then goto F10;
    label10: if Ch = P1[L2-10] then goto F9;
    label9: if Ch = P1[L2-9] then goto F8;
    label8: if Ch = P1[L2-8] then goto F7;
    label7: if Ch = P1[L2-7] then goto F6;
    label6: if Ch = P1[L2-6] then goto F5;
    label5: if Ch = P1[L2-5] then goto F4;
    label4: if Ch = P1[L2-4] then goto F3;
    label3: if Ch = P1[L2-3] then goto F2;
    label2: if Ch = P1[L2-2] then goto F1;
    label1: if Ch = P1[L2-1] then goto F0;
    Finish: Result := 0; exit;
    F0: Result := Len; exit;
    F1: Result := Len - 1; exit;
    F2: Result := Len - 2; exit;
    F3: Result := Len - 3; exit;
    F4: Result := Len - 4; exit;
    F5: Result := Len - 5; exit;
    F6: Result := Len - 6; exit;
    F7: Result := Len - 7; exit;
    F8: Result := Len - 8; exit;
    F9: Result := Len - 9; exit;
    F10: Result := Len - 10; exit;
    F11: Result := Len - 11; exit;
    F12: Result := Len - 12; exit;
    F13: Result := Len - 13; exit;
    F14: Result := Len - 14; exit;
    F15: Result := Len - 15; exit;
  end;
end;

//Author:            Chris Grant
//Date:              N/A
//Optimized for:     RTL
//Instructionset(s): N/A
//Original Name:     CharPosCJGPas3_b
//
// Same statements as CharPosFastcodePascal in this unit, exported under a
// second name. Returns the 1-based index of the first Ch in Str, or 0 if
// Str is empty or Ch does not occur: 16 characters per loop pass while
// more than 16 remain, then the label16..label1 chain for the last 0..16.
function CharPosFastcodeRTL(Ch : Char; const Str : AnsiString) : Integer;
var
  Len, L2 : Integer;
  P1, P2, P3 : PChar;
label
  Finish, Small;
label
  label1, label2, label3, label4, label5, label6, label7, label8, label9,
  label10, label11, label12, label13, label14, label15, label16;
label
  Found0, Found1, Found2, Found3, Found4, Found5, Found6, Found7, Found8,
  Found9, Found10, Found11, Found12, Found13, Found14, Found15, Found16;
label
  F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, F14, F15, F16;
begin
  If Str='' then
  begin
    Result := 0;
    exit;
  end
  else
  begin
    Len := PInteger(Integer(Str)-4)^; {Length is the dword just before the characters}
    P1 := Pointer(Str); {Point to start of string}
    P2 := P1 + Len; {Point to end of string}
    P3 := P2 - 16; {Point to one sweep before end of string this is only used as a reference}
    if (P1 < P3) then
    begin
      while 1 = 1 do
      begin
        if Ch = P1^ then goto Found0;
        if Ch = P1[1] then goto Found1;
        if Ch = P1[2] then goto Found2;
        if Ch = P1[3] then goto Found3;
        if Ch = P1[4] then goto Found4;
        if Ch = P1[5] then goto Found5;
        if Ch = P1[6] then goto Found6;
        if Ch = P1[7] then goto Found7;
        if Ch = P1[8] then goto Found8;
        if Ch = P1[9] then goto Found9;
        if Ch = P1[10] then goto Found10;
        if Ch = P1[11] then goto Found11;
        if Ch = P1[12] then goto Found12;
        if Ch = P1[13] then goto Found13;
        if Ch = P1[14] then goto Found14;
        if Ch = P1[15] then goto Found15;
        Inc(P1, 16);
        if P1 > P3 then goto Small; {Fewer than 16 characters left}
      end;
    end
    else
      goto Small;
    {Hits inside the sweep: Len + P1 - P2 is the 0-based offset of P1}
    Found15: Result := 16 + Len + P1 - P2; exit;
    Found14: Result := 15 + Len + P1 - P2; exit;
    Found13: Result := 14 + Len + P1 - P2; exit;
    Found12: Result := 13 + Len + P1 - P2; exit;
    Found11: Result := 12 + Len + P1 - P2; exit;
    Found10: Result := 11 + Len + P1 - P2; exit;
    Found9: Result := 10 + Len + P1 - P2; exit;
    Found8: Result := 9 + Len + P1 - P2; exit;
    Found7: Result := 8 + Len + P1 - P2; exit;
    Found6: Result := 7 + Len + P1 - P2; exit;
    Found5: Result := 6 + Len + P1 - P2; exit;
    Found4: Result := 5 + Len + P1 - P2; exit;
    Found3: Result := 4 + Len + P1 - P2; exit;
    Found2: Result := 3 + Len + P1 - P2; exit;
    Found1: Result := 2 + Len + P1 - P2; exit;
    Found0: Result := 1 + Len + P1 - P2; exit;
    Small:
    L2 := P2 - P1; {Characters still unchecked: 0..16}
    case L2 of
      0: goto Finish;
      1: goto label1;
      2: goto label2;
      3: goto label3;
      4: goto label4;
      5: goto label5;
      6: goto label6;
      7: goto label7;
      8: goto label8;
      9: goto label9;
      10: goto label10;
      11: goto label11;
      12: goto label12;
      13: goto label13;
      14: goto label14;
      15: goto label15;
    end;
    {L2 = 16 matches no case branch and falls through to label16}
    label16: if Ch = P1^ then goto F15;
    label15: if Ch = P1[L2-15] then goto F14;
    label14: if Ch = P1[L2-14] then goto F13;
    label13: if Ch = P1[L2-13] then goto F12;
    label12: if Ch = P1[L2-12] then goto F11;
    label11: if Ch = P1[L2-11] then goto F10;
    label10: if Ch = P1[L2-10] then goto F9;
    label9: if Ch = P1[L2-9] then goto F8;
    label8: if Ch = P1[L2-8] then goto F7;
    label7: if Ch = P1[L2-7] then goto F6;
    label6: if Ch = P1[L2-6] then goto F5;
    label5: if Ch = P1[L2-5] then goto F4;
    label4: if Ch = P1[L2-4] then goto F3;
    label3: if Ch = P1[L2-3] then goto F2;
    label2: if Ch = P1[L2-2] then goto F1;
    label1: if Ch = P1[L2-1] then goto F0;
    Finish: Result := 0; exit;
    F0: Result := Len; exit;
    F1: Result := Len - 1; exit;
    F2: Result := Len - 2; exit;
    F3: Result := Len - 3; exit;
    F4: Result := Len - 4; exit;
    F5: Result := Len - 5; exit;
    F6: Result := Len - 6; exit;
    F7: Result := Len - 7; exit;
    F8: Result := Len - 8; exit;
    F9: Result := Len - 9; exit;
    F10: Result := Len - 10; exit;
    F11: Result := Len - 11; exit;
    F12: Result := Len - 12; exit;
    F13: Result := Len - 13; exit;
    F14: Result := Len - 14; exit;
    F15: Result := Len - 15; exit;
  end;
end;

//Author:            John O'Harrow
//Date:              N/A
//Optimized for:     Blended
//Instructionset(s): N/A
//Original Name:     CharPosJOH_MMX
//
// Returns the 1-based index of the first Ch in Str, or 0 if there is none.
// Strings of up to 8 characters are compared byte by byte. Longer strings
// are compared 8 bytes at a time in MM0/MM1: PCMPEQB marks matching bytes,
// PACKSSWB squeezes the 8 result bytes into 4 (one per byte pair) and the
// code from @@Matched2 on works out which pair, and which byte of the
// pair, matched first. The last 8 bytes of the string are always covered
// by a separate, possibly overlapping, read at @@Last8.
function CharPosFastcodeBlended(Ch : Char; const Str : AnsiString) : Integer;
asm
  TEST      EDX, EDX         {Str = NIL?}
  JZ        @@NotFound       {Yes - Jump}
  MOV       ECX, [EDX-4]     {ECX = Length(Str)}
  CMP       ECX, 8
  JG        @@NotSmall
  TEST      ECX, ECX
  JZ        @@NotFound       {Exit if Length = 0}
@@Small:                     {1..8 characters: ECX counts the characters left}
  CMP       AL, [EDX]
  JZ        @Found1
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+1]
  JZ        @Found2
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+2]
  JZ        @Found3
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+3]
  JZ        @Found4
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+4]
  JZ        @Found5
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+5]
  JZ        @Found6
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+6]
  JZ        @Found7
  DEC       ECX
  JZ        @@NotFound
  CMP       AL, [EDX+7]
  JZ        @Found8
@@NotFound:
  XOR       EAX, EAX
  RET
@Found1:
  MOV       EAX, 1
  RET
@Found2:
  MOV       EAX, 2
  RET
@Found3:
  MOV       EAX, 3
  RET
@Found4:
  MOV       EAX, 4
  RET
@Found5:
  MOV       EAX, 5
  RET
@Found6:
  MOV       EAX, 6
  RET
@Found7:
  MOV       EAX, 7
  RET
@Found8:
  MOV       EAX, 8
  RET
@@NotSmall:                  {Length(Str) > 8}
  MOV       AH, AL
  ADD       EDX, ECX         {EDX = end of string}
  MOVD      MM0, EAX
  PUNPCKLWD MM0, MM0
  PUNPCKLDQ MM0, MM0         {MM0 = Ch in all 8 bytes}
  PUSH      ECX              {Save Length}
  NEG       ECX
@@First8:
  MOVQ      MM1, [EDX+ECX]
  ADD       ECX, 8
  PCMPEQB   MM1, MM0         {Compare All 8 Bytes}
  PACKSSWB  MM1, MM1         {Pack Result into 4 Bytes}
  MOVD      EAX, MM1
  TEST      EAX, EAX
  JNZ       @@Matched        {Exit on Match at any Position}
  CMP       ECX, -8          {Check if Next Loop would pass String End}
  JGE       @@Last8
@@Align:                     {Align to Previous 8 Byte Boundary}
  LEA       EAX, [EDX+ECX]
  AND       EAX, 7           {EAX -> 0 or 4}
  SUB       ECX, EAX
@@Loop:
  MOVQ      MM1, [EDX+ECX]
  ADD       ECX, 8
  PCMPEQB   MM1, MM0         {Compare All 8 Bytes}
  PACKSSWB  MM1, MM1         {Pack Result into 4 Bytes}
  MOVD      EAX, MM1
  TEST      EAX, EAX
  JNZ       @@Matched        {Exit on Match at any Position}
  CMP       ECX, -8          {Check if Next Loop would pass String End}
{$IFNDEF NoUnroll}
  JGE       @@Last8
  MOVQ      MM1, [EDX+ECX]
  ADD       ECX, 8
  PCMPEQB   MM1, MM0         {Compare All 8 Bytes}
  PACKSSWB  MM1, MM1         {Pack Result into 4 Bytes}
  MOVD      EAX, MM1
  TEST      EAX, EAX
  JNZ       @@Matched        {Exit on Match at any Position}
  CMP       ECX, -8          {Check if Next Loop would pass String End}
{$ENDIF}
  JL        @@Loop
@@Last8:
  MOVQ      MM1, [EDX-8]     {Position for Last 8 Used Characters}
  POP       EDX              {Original Length}
  PCMPEQB   MM1, MM0         {Compare All 8 Bytes}
  PACKSSWB  MM1, MM1         {Pack Result into 4 Bytes}
  MOVD      EAX, MM1
  TEST      EAX, EAX
  JNZ       @@Matched2       {Exit on Match at any Position}
  EMMS
  RET                        {Finished - Not Found}
@@Matched:                   {Set Result from 1st Match in EDX}
  POP       EDX              {Original Length}
  ADD       EDX, ECX
@@Matched2:
  EMMS
  SUB       EDX, 8           {Adjust for Extra ADD ECX,8 in Loop}
  TEST      AL, AL
  JNZ       @@MatchDone      {Match at Position 1 or 2}
  TEST      AH, AH
  JNZ       @@Match1         {Match at Position 3 or 4}
  SHR       EAX, 16
  TEST      AL, AL
  JNZ       @@Match2         {Match at Position 5 or 6}
  SHR       EAX, 8
  ADD       EDX, 6
  JMP       @@MatchDone
@@Match2:
  ADD       EDX, 4
  JMP       @@MatchDone
@@Match1:
  SHR       EAX, 8           {AL <- AH}
  ADD       EDX, 2
@@MatchDone:
  XOR       EAX, 2
  AND       EAX, 3           {EAX <- 1 or 2}
  ADD       EAX, EDX
end;
////////////////////////////////////////////////////////////////////////////////
// Run-time redirection.
// At unit initialization the address of the routine that CharPosStub calls
// for Pos(Ch, Str) is extracted from CharPosStub's machine code, and a
// TCodeHook (from elCodeHook) is created from that address to the CharPos
// implementation selected by GetCPUType (from elCPUID).

type
  { Signature shared by every CharPos implementation in this unit. }
  TFunc = function (Ch : Char; const Str : AnsiString) : Integer;

const
  { One implementation per TCPUType value. TCPUType is declared outside
    this unit, so the meaning of each slot depends on its declaration
    order - TODO confirm against elCPUID. The fifth slot deliberately uses
    the Blended routine; CharPosFastcodePrescott stays disabled. }
  NewFuncs: array[TCPUType] of TFunc = (
    CharPosFastcodeP3,
    CharPosFastcodeP4,
    CharPosFastcodeXP,
    CharPosFastcodeOpteron,
    CharPosFastcodeBlended,//CharPosFastcodePrescott,
    CharPosFastcodeBlended
  );

{ Never meant to be called for its result: it only exists so that the
  compiler emits a CALL to its character-position helper, which the
  initialization code below locates by decoding this routine. }
function CharPosStub(Ch : Char; const Str : AnsiString) : Integer;
begin
  Result := Pos(Ch, Str);
end;

var
  CodeHook: TCodeHook;
  OldFunc_: TFunc; // OldFunc_ & NewFunc_ have the same type to avoid overload ambiguities
  OldFunc: Pointer absolute OldFunc_;
  OldFuncSave: Pointer;
  OldProtect: DWORD;
  RestoredProtect: DWORD; // receives the protection replaced by the restoring call
  ProtectChanged: BOOL;   // True when the first VirtualProtect call succeeded
  NewFunc_: TFunc;
  NewFunc: Pointer absolute NewFunc_;

initialization
begin
///  CharPosStub(#0, '');
  OldFunc_ := CharPosStub;
  OldFuncSave := OldFunc;
  // The page holds executable code (possibly the code that is running right
  // now), so execute permission must be kept: PAGE_EXECUTE_READWRITE, not
  // PAGE_READWRITE. A failure is tolerated, as before; the bytes are only
  // read below.
  ProtectChanged := VirtualProtect(OldFuncSave, 3 + SizeOf(TJump), PAGE_EXECUTE_READWRITE, OldProtect);
  try
    // Assumes the CALL instruction sits 3 bytes into CharPosStub; the
    // Assert checks for the CALL rel32 opcode at that offset.
    Inc(Integer(OldFunc), 3);
    Assert(PBYTE(OldFunc)^=$E8);// asserts we have a call to _LStrCharPos
    Inc(Integer(OldFunc));
    // Target = address following the 4-byte displacement + displacement.
    OldFunc := Pointer(Integer(OldFunc) + SizeOf(Pointer) + PInteger(OldFunc)^); // calculate true address of _LStrCharPos
  finally
    // Restore the original protection only if it was changed. VirtualProtect
    // fails when its last argument is nil, so a real variable is passed.
    if ProtectChanged then
      VirtualProtect(OldFuncSave, 3 + SizeOf(TJump), OldProtect, RestoredProtect);
  end;
  if not FlushInstructionCache(GetCurrentProcess, OldFuncSave, 3 + SizeOf(TJump)) then
  begin
    RaiseLastOSError;
  end;
  NewFunc_ := NewFuncs[GetCPUType];
  // NOTE(review): CodeHook is never released in this unit - confirm whether
  // TCodeHook needs an explicit Free / unhook at finalization.
  CodeHook := TCodeHook.Create(OldFunc, NewFunc);
///  CharPos('', '');
end;
end.