This file can be included to 64bit programs written in Euro Assembler.
It contains OS-independent macros for operations with zero-terminated (ASCIIZ)
strings in ANSI or WIDE (Unicode) encoding.
Unicode string must always be word aligned and terminated with
the zero UNICHAR (word).
Macros may crash the process when the input string is not properly zero-terminated and the following memory is not available for reading.
All functions expect zeroed direction flag on input and they do not change it.
ANSI or WIDE functionality is selected by the current
EUROASM UNICODE= boolean option. Its value is available in system variable
%^UNICODE.
Similar macros with identical names for different program width are defined in string16.htm and string32.htm.
string64 HEAD
%^UNICODE at invocation time.SIZE# SomeText is 8, memory contains 4800_6900_2100_0000h
EUROASM UNICODE=YES
GetLength$ SomeText ; RCX is now 6 (3 nonzero UNICHARS).
EUROASM UNICODE=NO
GetLength$ SomeText ; RCX is now 1 (1 nonzero BYTE).
GetLength$ %MACRO String, Unicode=%^UNICODE
PUSHQ %String
%IF %Unicode
CALL GetLength$W@RT::
GetLength$W@RT:: PROC1
PUSH RAX,RDI
SUB EAX,EAX
SUB ECX,ECX
MOV RDI,[RSP+24] ; Pointer to String$.
DEC RCX
REPNE SCASW
NOT RCX
DEC RCX
SHL RCX,1
POP RDI,RAX
RET 1*8
ENDPROC1 GetLength$W@RT::
%ELSE ; Not %Unicode.
CALL GetLength$A@RT::
GetLength$A@RT:: PROC1
PUSH RAX,RDI
SUB EAX,EAX
SUB ECX,ECX
MOV RDI,[RSP+24] ; Pointer to String$.
DEC RCX
REPNE SCASB
NOT RCX
DEC RCX
POP RDI,RAX
RET 1*8
ENDPROC1 GetLength$A@RT::
%ENDIF
%ENDMACRO GetLength$
SIZE# %Destination.
%^UNICODE at invocation time.
Concat$ %MACRO Destination, Source1,Source2,,, Size=, Unicode=%^UNICODE
%IF %# < 2 ; >>
%ERROR ID=5930, 'Missing operand of macro "Concat$".'
%EXITMACRO Concat$
%ENDIF
PUSH RBP ; Variable number of arguments uses a special calling convention.
MOV RBP,RSP ; Store stack pointer.
ArgNr %FOR %#..2, STEP= -1
PUSHQ %*{%ArgNr} ; All Source pointers, starting with the last.
%ENDFOR ArgNr
PUSHQ %# - 1 ; Number of pushed Source strings.
%IF "%Size" === ""
PUSHQ SIZE# %Destination
%ELSE
PUSHQ %Size
%ENDIF
PUSHQ %Destination
%IF %Unicode
CALL Concat$W@RT:
Concat$W@RT: PROC1
PUSH RAX,RCX,RDX,RBP,RSI,RDI
MOV RDI,[RSP+56] ; %Destination.
MOV RDX,[RSP+64] ; %Size.
MOV RCX,[RSP+72] ; Number of Source strings.
LEA RBP,[RSP+80] ; Variable pointer to 1st sourcestring.
LEA RDX,[RDI+RDX-2] ; End of allocated Destination.
XOR EAX,EAX
.20: MOV RSI,[RBP] ; Pointer to %Source.
.30: LODSW
TEST EAX ; Check if it's end of source string.
JZ .40:
CMP RDI,RDX ; Check if it's end of destination string.
CMC
JC .80: ; If destination size overflowed.
STOSW
JMP .30:
.40: ADD RBP,8 ; The next Source pointer on stack frame.
LOOP .20:
.80: XOR EAX,EAX ; Finally zero-terminate the destination.
STOSW
POP RDI,RSI,RBP,RDX,RCX,RAX
RET ; CF=overflow
ENDPROC1 Concat$W@RT:
%ELSE ; If not %Unicode.
CALL Concat$A@RT:
Concat$A@RT: PROC1
PUSH RAX,RCX,RDX,RBP,RSI,RDI
MOV RDI,[RSP+56] ; %Destination.
MOV RDX,[RSP+64] ; %Size.
MOV RCX,[RSP+72] ; Number of Source strings.
LEA RBP,[RSP+80] ; Variable pointer to 1st sourcestring.
LEA RDX,[RDI+RDX-1] ; End of the allocated Destination.
XOR EAX,EAX
.20: MOV RSI,[RBP] ; Pointer to %Source.
.30: LODSB
TEST EAX ; Check if it's end of source string.
JZ .40:
CMP RDI,RDX ; Check if it's end of destination string.
CMC
JC .80: ; If destination size overflowed.
STOSB
JMP .30:
.40: ADD RBP,8 ; The next Source pointer on stack frame.
LOOP .20:
.80: XOR EAX,EAX ; Finally zero-terminate the destination.
STOSB
POP RDI,RSI,RBP,RDX,RCX,RAX
RET ; CF=overflow
ENDPROC1 Concat$A@RT:
%ENDIF
MOV RSP,RBP ; Restore the stack.
POP RBP
%ENDMACRO Concat$
%^UNICODE at invocation time.
Compare$ %MACRO String1, String2, Unicode=%^UNICODE
%IF "%String2" === ""
PUSHQ RDI
%ELSE
PUSHQ %String2
%ENDIF
%IF "%String1" === ""
PUSHQ RSI
%ELSE
PUSHQ %String1
%ENDIF
%IF %Unicode
CALL Compare$W@RT:
Compare$W@RT: PROC1
PUSH RAX,RCX,RDX,RBX,RSI,RDI
SUB EAX,EAX
SUB ECX,ECX
MOV RDI,[RSP+64] ; %String2.
DEC RCX
MOV RBX,RDI
REPNE:SCASW ; Search for the terminator.
SUB RDI,RBX ; Size of String2 in bytes including the UNICHAR NUL.
MOV RDX,RDI
MOV RDI,[RSP+56] ; %String1.
MOV RSI,RDI
REPNE:SCASW ; Search for the terminator.
MOV RCX,RDI
SUB RCX,RSI ; Size of %String1 in bytes including the UNICHAR NUL.
CMP RCX,RDX ; Compare string sizes.
JNE .90 ; If sizes do not match.
MOV RDI,RBX ; String2.
REPE CMPSB
.90:POP RDI,RSI,RBX,RDX,RCX,RAX
RET 2*8
ENDPROC1 Compare$W@RT:
%ELSE ; If not %Unicode.
CALL Compare$A@RT:
Compare$A@RT: PROC1
PUSH RAX,RCX,RDX,RBX,RSI,RDI
SUB EAX,EAX
SUB ECX,ECX
MOV RDI,[RSP+64] ; %String2.
DEC RCX
MOV RBX,RDI
REPNE:SCASB ; Search for the terminator.
SUB RDI,RBX ; Size of String2 in bytes including the UNICHAR NUL.
MOV RDX,RDI
MOV RDI,[RSP+56] ; %String1.
MOV RSI,RDI
REPNE:SCASB ; Search for the terminator.
MOV RCX,RDI
SUB RCX,RSI ; Size of %String1 in bytes including the UNICHAR NUL.
CMP RCX,RDX ; Compare string sizes.
JNE .90 ; If sizes do not match.
MOV RDI,RBX ; String2.
REPE CMPSB
.90:POP RDI,RSI,RBX,RDX,RCX,RAX
RET 2*8
ENDPROC1 Compare$A@RT:
%ENDIF
%ENDMACRO Compare$
Macro DecodeUTF8 converts Source UTF-8 string to UTF-16 or UTF-32 string.
Source string is either zero-terminated, or its Size= must be specified.
Conversion stops at NUL byte, which is not converted to output.
Input never reads beyond Source+Size.
If Byte Order Mark (BOM, 0xEF,0xBB,0xBF
) is detected at the beginning of the Source string, it is ignored.
Invalid UTF-8 sequence will send a replacement character 0xFFFD
� to the output.
Byte order in output encoding is always LittleEndian, the same which is used in MS Windows WIDE functions.
If you want to produce UTF-16BE, performXCHG AL,AHin CallbackProc.
If you want to produce UTF-32BE, performBSWAP EAXin CallbackProc.
If you want to prefix the output string with BOM, store it to destination buffer before invoking DecodeUTF8.
If you don't like replacement characters (usually displayed as little squares �), filter them out in CallbackProc.
0xD800..0xDFFF when the input UTF-8 character
belongs to Unicode supplementary planes (Emoji, Asian characters etc).
0x0000_FFFD when the input UTF-8 string is malformed.
DecodeUTF8 %MACRO Source, CallbackProc, Size=-1, Width=16
%IF %Width != 16 && %Width != 32
%ERROR ID=5932,'Macro "DecodeUTF8" requires Width=16 or Width=32.'
%EXITMACRO DecodeUTF8
%ENDIF
PUSHQ %Width, %Size, %CallbackProc, %Source
CALL DecodeUTF8@RT::
DecodeUTF8@RT:: PROC1
PUSH RAX,RCX,RDX,RBX,RSI,RDI
SUB ECX,ECX
MOV [RSP+4*8],RCX ; Initialize %ReturnRCX to 0.
MOV RDI,[RSP+7*8] ; %Source.
MOV RCX,[RSP+9*8] ; %Size.
MOV RSI,RDI
MOV RAX,RCX
INC RAX
JZ .Scan: ; If Size=-1, RAX=0 and the Source size will be scanned.
LEA RDI,[RSI+RCX] ; Otherwise use the explicit %Size.
JMP .No0:
.Scan:REPNE:SCASB
JNE .No0:
DEC RDI ; Omit the terminator from conversion.
.No0: ; Source string without NUL is now at RSI..RDI.
BOM %FOR 0xEF,0xBB,0xBF ; Little-Endian BOM (0xFEFF) encoded in UTF-8.
CMP RSI,RDI
JNB .NoBOM:
LODSB
CMP AL,%BOM
JNE .NoBOM:
%ENDFOR BOM
JMP .Start: ; BOM was detected, ESI is advanced just behind it.
.NoBOM:MOV RSI,[RSP+7*8] ; No BOM detected, restore ESI to the start of Source again.
.Start:CMP RSI,RDI
JNB .End:
XOR EBX,EBX
LODSB
MOV BL,AL
NOT BL
BSR RCX,RBX ; Scan bits 7..0 of the inverted first byte of UTF-8 character. CL=0..7.
MOV BL,AL ; First byte of 1,2,3,4 bytes long UTF-8 character (not inverted).
MOV DL,0x7F ; Prepare mask for payload bits in the 1st UTF-8 byte.
SUB RCX,7 ; RCX=7,5,4,3 change to RCX=0,-2,-3,-4.
JZ .Out: ; When RBX is a codepoint 0..0x7F (7bit ASCII character).
NEG RCX ; RCX=2,3,4 (number of bytes in UTF-8 character).
SHR DL,CL ; DL=0x1F,0x0F,0x07 is payload mask.
AND BL,DL ; EBX will accumulate payload bits of codepoint.
CMP CL,2
JB .Bad:
CMP CL,4
JBE .Good:
.Bad: MOV EAX,0xFFFD ; Invalid UTF-8 detected, output the replacement.
JMP .NoSg:
.Good:DEC RCX ; RCX=1, 2 or 3 continuation bytes expected.
LEA RAX,[RSI+RCX]
CMP RAX,RDI ; Check if there's that many input bytes left.
JBE .Cont:
DEC RSI ; Rollback, the last UTF-8 character is incomplete.
SUB RDI,RSI ; RDI characters (1..3) were not decoded.
MOV [RSP+4*8],RDI ; %ReturnRCX.
JMP .End: ; CF=0.
.Cont:LODSB ; Continuation byte AL=10xxxxxxb expected.
BTR RAX,7 ; Reset the marker bit 7.
JNC .Bad:
BTR RAX,6
JC .Bad:
SHL RBX,6 ; Make room in RBX for the next 6 bits.
OR BL,AL ; Accumulate them.
DEC RCX
JNZ .Cont:
.Out: MOV RAX,RBX ; EAX=EBX is now the decoded codepoint 0..0x10_FFFF.
; Check for overlong encodings. DL=0x7F,0x1F,0x0F,0x07 for 1,2,3,4 bytes in UTF-8 character.
CMP EBX,0x01_0000 ; Codepoint 0x01_0000..0x10_FFFF should be encoded in 4 bytes.
JAE .NoOverlong:
CMP EBX,0x00_0800 ; Codepoint 0x00_0800..0x00_FFFF should be encoded in 3 bytes.
JB .2Bts:
CMP DL,0x0F
JE .NoOverlong:
JMP .Bad:
.2Bts:CMP EBX,0x00_0080 ; Codepoint 0x00_0080..0x00_07FF should be encoded in 2 bytes.
JB .1Bts:
CMP DL,0x1F
JE .NoOverlong:
JMP .Bad:
.1Bts:CMP DL,0x7F ; Codepoint 0x00_0000..0x00_007F should be encoded in 1 byte.
JE .NoOverlong:
TEST EBX
JNZ .Bad:
CMP DL,0x1F ; Exception: codepoint 0 may be encoded in 1 or 2 bytes.
JNE .Bad:
.NoOverlong:
SHR EBX,11 ; Check for surrogate codepoints.
CMP BL,0x1B
JE .Bad: ; Do not accept surrogates 0xD800..0xDFFF from input.
TEST BX,0x3E0
JZ .NoSg: ; If codepoint EAX is below 0x0001_0000, surrogates do not apply.
CMPB [RSP+10*8],16 ; Output UTF %Width (16 or 32).
JNE .NoSg: ; UTF-32 does not need surrogates.
SUB RAX,0x0001_0000 ; Codepoint RAX was not encodable in one UTF-16 character.
MOV EBX,0x0000_03FF ; Use two surrogate Unichars.
AND EBX,EAX
SHR RAX,10
ADD EBX,0x0000_DC00 ; EBX is now the low surrogate.
ADD EAX,0x0000_D800 ; EAX is now the high surrogate.
CALL .OutEAX: ; The high surrogate first.
MOV RAX,RBX ; The low surrogate.
JC .End: ; If aborted by CallbackProc.
.NoSg:CALL .OutEAX: ; The low surrogate or BMP codepoint or UTF-32.
JNC .Start: ; Parse the next UTF-8 character from the string RSI..RDI.
.OutEAX:PROC1 ; Send EAX to callback. Preserves RBX,RSI,RDI, updates ReturnRDI.
PUSH RBX,RSI,RDI
MOV RDI,[RSP+4*8] ; ReturnRDI restore.
CALL [RSP+12*8] ; CallbackProc.
MOV [RSP+4*8],RDI ; ReturnRDI update.
POP RDI,RSI,RBX
RET
ENDPROC1 .OutEAX:
.End:POP RDI,RSI,RBX,RDX,RCX,RAX
RET 4*8
ENDP1 DecodeUTF8@RT::
%ENDMACRO DecodeUTF8
Macro EncodeUTF8 converts a codepoint to an UTF-8 character and stores it into the string at RDI.
EncodeUTF8 %MACRO
CALL EncodeUTF8@RT:
EncodeUTF8@RT: PROC1
CMP EAX,0x0000_0080 ; ASCII character?
JAE .10:
STOSB ; Store it as is.
RET
.10: PUSH RAX,RCX
MOV ECX,EAX
CMP EAX,0x0000_0800 ; Two-byte encoding?
JAE .30:
SHR EAX,6
OR AL,0xC0
.20: STOSB
MOV EAX,ECX
AND AL,0xBF
OR AL,0x80
STOSB
POP RCX,RAX
RET
.30: CMP EAX,0x0001_0000 ; Three-byte encoding?
JAE .40:
XCHG AL,AH
SHR AL,4
OR AL,0xE0
STOSB
MOV EAX,ECX
SHL EAX,2
XCHG AL,AH
AND AL,0xBF
OR AL,0x80
JMP .20:
.40: SHR EAX,18 ; Four-byte encoding?
AND AL,0xF7
OR AL,0xF0
STOSB
MOV EAX,ECX
SHR EAX,12
AND AL,0xBF
OR AL,0x80
STOSB
MOV EAX,ECX
SHR EAX,6
AND AL,0xBF
OR AL,0x80
JMP .20:
ENDP1 EncodeUTF8@RT:
%ENDMACRO EncodeUTF8
Macro GetLengthUTF8 returns the length of the string in UTF-8 characters.
GetLengthUTF8 %MACRO String, Size=-1
PUSHQ %Size, %String
CALL GetLengthUTF8@RT
GetLengthUTF8@RT PROC1
PUSH RAX,RDX,RSI
MOV RCX,[RSP+5*8] ; %Size.
MOV RSI,[RSP+4*8] ; %String.
MOV RDX,RCX
XOR EAX,EAX
INC RCX
JZ .10: ; Jump when Size=-1.
LEA RDX,[RSI+RCX] ; At the end of String when Size<>-1.
.10: SUB ECX,ECX ; The returned String length.
.20: CMP RSI,RDX ; End of String?
JAE .90:
XOR EAX,EAX
LODSB
CMP EDX,-1 ; Is the NUL byte relevant?
JNE .30:
CMP AL,0 ; Yes, it is.
JE .90:
.30: CMP AL,0x7F ; Ordinary ASCII character?
JBE .80: ; Count +1 if yes.
NOT AL ; Scan bits 7..0 of the inverted first byte.
BSR EAX,EAX ; Returns EAX=5, 4, 3 for 2, 3, 4 bytes long UTF-8 character.
NEG RAX ; Returns RAX=-5,-4,-3 for 2, 3, 4 bytes long UTF-8 character.
LEA RSI,[RSI+RAX+6] ; Increments RSI by 1, 2, 3 for 2, 3, 4 bytes long UTF-8 character.
.80: INC ECX ; String length.
JMP .20:
.90: POP RSI,RDX,RAX
RET 2*8
ENDP1 GetLengthUTF8@RT
%ENDMACRO GetLengthUTF8
Macro Time1970 converts the time from 32bit C-time (number of seconds since midnight Jan 1st 1970 UTC to the ASCIIZ format yyyy/mm/dd hh:mm:ss.
Time1970 %MACRO DateTime32, DateTimeString
PUSH %DateTimeString, %DateTime32
CALL Time1970@RT
Time1970@RT PROC1
PUSH RAX,RBX,RCX,RDX,RSI,RDI
SUB RSP,16
%month %SETA 0
DaysInMonth %FOR 31,28,31,30,31,30,31,31,30,31,30,31
MOVB [RSP+%month],%DaysInMonth
%month %SETA %month+1
%ENDFOR DaysInMonth
MOV RDI,[RSP+10*8] ; %DateTimeString
MOV RAX,[RSP+ 9*8] ; %DateTime32
XOR EDX,EDX
MOV ECX,24*60*60
DIV RCX
; RAX is whole days since Jan 1st 1970. RDX is seconds since midnight.
MOV RSI,RDX
MOV ECX,365+365+366+365
XOR EDX,EDX
DIV RCX
LEA RBX,[4*RAX+1970] ; RBX is four-year. EDX is number of days since RBX (0..4*365)
SUB EDX,365
JB .30:
INC EBX
SUB EDX,365
JB .30:
INC EBX
SUB EDX,366
JB .20:
INC EBX
JMPS .40:
.20: INC EDX
.30: ADD EDX,365
.40: TEST BL,3 ; RBX is year. EDX is number of days since RBX (0..365)
JNZ .50:
MOVB [RSP+1],29
.50: MOV EAX,EBX
StoD RDI,Size=4
MOV AL,'/'
STOSB
XOR EBX,EBX
XOR ECX,ECX
.53: MOVZXB EAX,[RSP+RBX] ; 31,28|29,31,30,,,
ADD ECX,EAX ; 31,59|60,90|91,,,
INC EBX ; RBX is month (1..12).
.55: CMP ECX,EDX
JNA .53:
SUB ECX,EAX
SUB EDX,ECX
INC EDX ; RBX is month, RDX is day (1..31).
MOV EAX,EBX
StoD RDI,Size=2,Align=right,LeadingZeroes=yes ; Store month.
MOV AL,'/'
STOSB
MOV EAX,EDX
StoD RDI,Size=2,Align=right,LeadingZeroes=yes ; Store days.
MOV AL,' '
STOSB
MOV EAX,ESI ; EAX is seconds since midnight.
XOR EDX,EDX
MOV ECX,60*60
DIV RCX
StoD RDI,Size=2,Align=right,LeadingZeroes=yes ; Store hours.
MOV AL,':'
STOSB
MOV EAX,EDX
XOR EDX,EDX
MOV ECX,60
DIV RCX
StoD RDI,Size=2,Align=right,LeadingZeroes=yes ; Store minutes.
MOV AL,':'
STOSB
MOV EAX,EDX
StoD RDI,Size=2,Align=right,LeadingZeroes=yes ; Store seconds.
XOR EAX,EAX
STOSB
ADD RSP,16
POP RDI,RSI,RDX,RCX,RBX,RAX
RET 2*8
ENDP1 Time1970@RT
%ENDMACRO Time1970
ENDHEAD string64