This file can be included in 16-bit programs written in EuroAssembler.
It contains OS-independent macros for zero-terminated (ASCIIZ) string operations.
All macros expect the direction flag to be zero on input, and they leave it zero on output.
Similar macros with identical names for different program width are defined in string32.htm and string64.htm.
string16 HEAD
; GetLength$ String
;   Returns in CX the number of bytes in the zero-terminated String,
;   not counting the terminating NUL.
;   String is the offset of the string; it may be omitted from loading when
;   it is already in CX. The string is scanned with SCASB, i.e. through ES:DI.
;   AX and DI are preserved. Flags are changed.
GetLength$ %MACRO String
    %IF "%String" !== "CX"
      MOV CX,%String            ; The runtime procedure takes the string pointer in CX.
    %ENDIF
    CALL GetLength$@RT::
GetLength$@RT:: PROC1           ; Runtime part, expanded only once per program.
    PUSH AX,DI
      CLD                       ; Scan upwards.
      MOV DI,CX                 ; ES:DI = start of the string.
      SUB AX,AX                 ; AL=0 is the terminator searched for.
      MOV CX,-1                 ; Maximal count; CX is decremented once per scanned byte.
      REPNE SCASB               ; Stops behind the first NUL.
      NOT CX                    ; CX = number of scanned bytes, including the NUL.
      DEC CX                    ; Do not count the NUL.
    POP DI,AX
    RET
  ENDPROC1 GetLength$@RT::
%ENDMACRO GetLength$
SIZE# %Destination
.
; Concat$ Destination, Source1, Source2,.., Size=
;   Copies one or more zero-terminated Source strings one after another
;   into the Destination buffer and terminates the result with NUL.
;   Destination  offset of the output buffer (written with STOSB, i.e. ES:DI).
;   Source*      offsets of the input strings (read with LODSB, i.e. DS:SI).
;   Size=        size of the Destination buffer in bytes; when omitted,
;                SIZE# %Destination is used.
;   Copying stops as soon as DI reaches Destination+Size-1, so the NUL still
;   fits into the buffer (this assumes Size is at least 1).
;   All general registers are preserved. CF=1 is returned when copying
;   stopped because the buffer was full, CF=0 otherwise.
Concat$ %MACRO Destination, Source,,, Size=
    %IF %# < 2 ; >
      %ERROR ID=5930, 'Missing operand of macro "Concat$".'
      %EXITMACRO Concat$
    %ENDIF
    PUSH BP
    MOV BP,SP                   ; Remember SP; a variable number of words is pushed below.
    ArgNr %FOR %#..2, STEP= -1
      PUSHW %*{%ArgNr}          ; Source pointers, the last one first.
    %ENDFOR ArgNr
    PUSHW %# - 1                ; Number of Source strings.
    %IF "%Size" === ""
      PUSHW SIZE# %Destination
    %ELSE
      PUSHW %Size
    %ENDIF
    PUSHW %Destination
    CALL Concat$@RT::
    MOV SP,BP                   ; Discard all pushed arguments (flags are not touched).
    POP BP
Concat$@RT:: PROC1              ; Runtime part, expanded only once per program.
    PUSHAW
      CLD                       ; Copy upwards.
      MOV BP,SP                 ; 16 bytes of PUSHAW + 2 bytes of return address precede the arguments.
      MOV DI,[BP+18]            ; %Destination.
      MOV DX,[BP+20]            ; %Size.
      MOV CX,[BP+22]            ; Number of Source strings.
      ADD DX,DI
      DEC DX                    ; DX = Destination+Size-1, the last byte reserved for NUL.
.NextSource:
      MOV SI,[BP+24]            ; Pointer to the current Source string.
.CopyByte:
      LODSB
      CMP AL,0                  ; Also leaves CF=0 when the Source terminator is found.
      JE .SourceDone:
      CMP DI,DX
      CMC                       ; CF=1 when DI >= DX.
      JC .Terminate:            ; Destination is full, report the overflow in CF.
      STOSB
      JMP .CopyByte:
.SourceDone:
      INC BP,BP                 ; Advance the frame by one word to the next Source pointer.
      LOOP .NextSource:
.Terminate:
      MOV AL,0                  ; MOV keeps CF as it was set above.
      STOSB
    POPAW
    RET
  ENDP1 Concat$@RT::
%ENDMACRO Concat$
; Compare$ String1, String2
;   Compares two zero-terminated strings for equality.
;   String1  offset of the first string, addressed through DS. Default is SI.
;   String2  offset of the second string, addressed through ES. Default is DI.
;   Returns ZF=1 when both strings have the same size and identical contents,
;   ZF=0 otherwise. All general registers and ES are preserved.
Compare$ %MACRO String1, String2
    %IF "%String2" === ""
      PUSHW DI
    %ELSE
      PUSHW %String2
    %ENDIF
    %IF "%String1" === ""
      PUSHW SI
    %ELSE
      PUSHW %String1
    %ENDIF
    CALL Compare$@RT::
Compare$@RT:: PROC1             ; Runtime part, expanded only once per program.
    PUSHAW
      CLD                       ; Scan and compare upwards.
      MOV BP,SP                 ; 16 bytes of PUSHAW + 2 bytes of return address precede the arguments.
      XOR AX,AX                 ; AL=0 is the terminator searched for.
      ; Measure String2 at ES:DI.
      MOV DI,[BP+20]            ; %String2.
      MOV CX,-1
      MOV BX,DI                 ; Keep the start of String2 in BX.
      REPNE:SCASB               ; Search for the terminator of String2.
      SUB DI,BX
      MOV DX,DI                 ; DX = size of String2 including the NUL.
      ; Measure String1 at DS:SI. SCASB can only scan through ES.
      MOV DI,[BP+18]            ; %String1.
      MOV CX,-1
      MOV SI,DI                 ; Keep the start of String1 in SI.
      PUSH ES,DS
      POP ES                    ; Temporarily ES=DS.
      REPNE:SCASB               ; Search for the terminator of String1.
      POP ES                    ; Original ES is back.
      MOV CX,DI
      SUB CX,SI                 ; CX = size of String1 including the NUL.
      CMP CX,DX
      JNE .Done:                ; Different sizes, return ZF=0.
      MOV DI,BX                 ; ES:DI = String2, DS:SI = String1, CX = common size.
      REPE CMPSB                ; ZF=1 only if all CX bytes match.
.Done:
    POPAW
    RET 2*2                     ; Remove both pushed pointers.
  ENDPROC1 Compare$@RT::
%ENDMACRO Compare$
Macro DecodeUTF8 converts Source UTF-8 string to UTF-16 or UTF-32 string
in 16bit CPU mode. It requires CPU 386 or higher.
Source string is either zero-terminated, or its Size= must be specified.
Conversion stops at NUL byte, which is not converted to output.
Input never reads beyond Source+Size.
If a Byte Order Mark (BOM, 0xEF,0xBB,0xBF) is detected at the beginning of the Source string, it is ignored.
Invalid UTF-8 sequence will send a replacement character 0xFFFD
� to the output.
Byte order in output encoding is always LittleEndian, the same which is used in MS Windows WIDE functions.
If you want to produce UTF-16BE, perform XCHG AL,AH in CallbackProc.
If you want to produce UTF-32BE, perform BSWAP EAX in CallbackProc.
If you want to prefix the output string with BOM, store it to destination buffer before invoking DecodeUTF8.
If you don't like replacement characters (usually displayed as little squares �), filter them out in CallbackProc.
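With Width=16, CallbackProc receives in EAX a pair of surrogates from the range 0x0000_D800..0x0000_DFFF (high surrogate first) when the input UTF-8 character belongs to Unicode supplementary planes (Emoji, Asian characters etc).
CallbackProc receives 0x0000_FFFD in EAX when the input UTF-8 string is malformed.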
; DecodeUTF8 Source, CallbackProc, Size=-1, Width=16
;   Decodes the UTF-8 string at ES:Source and sends each decoded character
;   to CallbackProc as UTF-16 or UTF-32 code units. Requires CPU 386 or higher.
;   Source        offset of the input string (read through ES).
;   CallbackProc  offset of a near procedure called once per output code unit
;                 with EAX=code unit and DI=the caller's DI as updated by the
;                 previous callback. It returns CF=1 to abort the conversion.
;                 EBX, ES, SI and BP are restored after each callback.
;   Size=         number of input bytes. With the default -1 the input is
;                 scanned for a NUL terminator instead.
;   Width=        16 or 32, width of the output encoding.
;   Returns: CX = number of trailing input bytes which form an incomplete
;                 UTF-8 character and were not decoded (0 otherwise),
;            DI = the value left in DI by the last callback,
;            CF = 1 if CallbackProc aborted the conversion, otherwise 0.
;   Other general registers and ES are preserved.
;   Malformed input (bad lead or continuation byte, overlong encoding,
;   encoded surrogate, codepoint above 0x10_FFFF) is reported to the callback
;   as the replacement character 0xFFFD.
DecodeUTF8 %MACRO Source, CallbackProc, Size=-1, Width=16
    %IF %Width != 16 && %Width != 32
      %ERROR ID=5932,'Macro "DecodeUTF8" requires Width=16 or Width=32.'
      %EXITMACRO DecodeUTF8
    %ENDIF
    PUSHW %Width, %Size, %CallbackProc, %Source
    CALL DecodeUTF8@RT::
DecodeUTF8@RT:: PROC1
    ; Frame after PUSHAW: [BP+0]=saved DI (%ReturnDI), [BP+12]=saved CX (%ReturnCX),
    ; [BP+16]=return address, [BP+18]=%Source, [BP+20]=%CallbackProc,
    ; [BP+22]=%Size, [BP+24]=%Width.
    PUSHAW
    MOV BP,SP
    PUSH ES                     ; Saved here, restored at .End.
    SUB CX,CX
    MOV [BP+12],CX              ; Initialize %ReturnCX to 0.
    MOV DI,[BP+18]              ; %Source.
    MOV CX,[BP+22]              ; %Size.
    MOV SI,DI
    MOV AX,CX
    INC AX
    JZ .Scan:                   ; If Size=-1, AX=0 and the Source size will be scanned.
    ADD DI,CX                   ; Otherwise use the explicit %Size.
    JNC .No0:
    MOV DI,-1                   ; Source+Size wrapped around, clamp the end pointer.
    JMP .No0:
.Scan:REPNE:SCASB               ; Search for NUL (AL=0) with CX=-1.
    JNE .No0:
    DEC DI                      ; Omit the terminator from conversion.
.No0:                           ; Source string without NUL is now at ES:SI..ES:DI.
  BOM %FOR 0xEF,0xBB,0xBF       ; The three bytes of BOM 0xFEFF encoded in UTF-8.
    CMP SI,DI
    JNB .NoBOM:
    LODSB [ES:SI]
    CMP AL,%BOM
    JNE .NoBOM:
  %ENDFOR BOM
    JMP .Start:                 ; BOM was detected, SI is advanced just behind it.
.NoBOM:MOV SI,[BP+18]           ; No BOM detected, restore SI to the start of Source again.
.Start:CMP SI,DI                ; Decode one UTF8 character from the string ES:SI..ES:DI.
    JNB .End:                   ; Whole input consumed; CF=0 here.
    XOR EBX,EBX
    LODSB [ES:SI]
    MOV BL,AL
    NOT BL
    BSR CX,BX                   ; Scan bits 7..0 of inverted first byte of 1,2,3,4 bytes long UTF-8 character.
    JZ .Invalid:                ; Lead byte 0xFF inverts to 0; BSR then leaves CX undefined.
    MOV BL,AL                   ; First byte of 1,2,3,4 bytes long UTF-8 character (not inverted).
    MOV DL,0x7F                 ; Prepare mask for payload bits in the 1st UTF-8 byte.
    SUB CX,7                    ; CX=7,5,4,3 change to CX=0,-2,-3,-4.
    JZ .Out:                    ; Done when BX is codepoint 0..0x7F (7bit ASCII character).
    NEG CX                      ; CX=2,3,4 (number of bytes in UTF-8 character).
    SHR DL,CL                   ; DL=0x1F,0x0F,0x07 is the payload mask of the first UTF-8 byte.
    AND BL,DL                   ; EBX will accumulate payload bits of codepoint.
    CMP CL,2
    JB .Invalid:                ; CL=1: a continuation byte in place of a lead byte.
    CMP CL,4
    JBE .Good:                  ; CL>4: lead bytes 0xF8..0xFE are not valid UTF-8.
.Invalid:
    MOV EAX,0xFFFD              ; Invalid UTF-8 detected, output the replacement instead.
    JMP .NoSg:
.Good:DEC CX                    ; CX=1, 2 or 3 continuation bytes 10xxxxxxb expected.
    MOV AX,SI                   ; Check if there's that many input bytes left.
    ADD AX,CX
    CMP AX,DI
    JBE .Cont:
    DEC SI                      ; Rollback, the last UTF-8 character is incomplete.
    SUB DI,SI                   ; DI bytes (1..3) were not decoded.
    MOV [BP+12],DI              ; %ReturnCX.
    JMP .End:                   ; CF=0.
.Cont:LODSB [ES:SI]             ; Continuation byte AL=10xxxxxxb expected.
    BTR AX,7                    ; Reset the marker bit 7.
    JNC .Invalid:               ; Bit 7 must have been set.
    BTR AX,6
    JC .Invalid:                ; Bit 6 must have been clear.
    SHL EBX,6                   ; Make room in EBX for the next 6 bits.
    OR BL,AL                    ; Accumulate them.
    DEC CX
    JNZ .Cont:
.Out: MOV EAX,EBX               ; EAX=EBX is now the decoded value, up to 21 bits.
    CMP EBX,0x10_FFFF           ; 4-byte sequences can encode values above the Unicode range.
    JA .Invalid:                ; They would also yield a wrong high surrogate below.
    ; Check for overlong encodings. DL=0x7F,0x1F,0x0F,0x07 for 1,2,3,4 bytes in UTF-8 character.
    CMP EBX,0x01_0000           ; Codepoint 0x01_0000..0x10_FFFF should be encoded in 4 bytes.
    JAE .NoOverlong:
    CMP BX,0x00_0800            ; Codepoint 0x00_0800..0x00_FFFF should be encoded in 3 bytes.
    JB .2Bts:
    CMP DL,0x0F
    JE .NoOverlong:
    JMP .Invalid:
.2Bts:CMP BX,0x00_0080          ; Codepoint 0x00_0080..0x00_07FF should be encoded in 2 bytes.
    JB .1Bts:
    CMP DL,0x1F
    JE .NoOverlong:
    JMP .Invalid:
.1Bts:CMP DL,0x7F               ; Codepoint 0x00_0000..0x00_007F should be encoded in 1 byte.
    JE .NoOverlong:
    TEST BX,BX
    JNZ .Invalid:
    CMP DL,0x1F                 ; Exception: codepoint 0 may be encoded in 1 or 2 bytes.
    JNE .Invalid:
.NoOverlong:
    SHR EBX,11                  ; Check for surrogate codepoints: BX=codepoint/0x800, 0..0x21F.
    CMP BX,0x1B                 ; Whole BX must be tested; BL alone is 0x1B for 0x8_D800 and 0x10_D800 too.
    JE .Invalid:                ; Do not accept surrogates 0xD800..0xDFFF from input.
    TEST BX,0x3E0
    JZ .NoSg:                   ; If codepoint EAX is below 0x0001_0000, surrogates do not apply.
    CMPW [BP+24],16             ; Output UTF %Width (16 or 32).
    JNE .NoSg:                  ; UTF-32 does not need surrogates.
    SUB EAX,0x0001_0000         ; Codepoint EAX was not encodable in one UTF-16 character.
    MOV EBX,0x0000_03FF         ; Use two surrogate Unichars.
    AND EBX,EAX
    SHR EAX,10
    ADD EBX,0x0000_DC00         ; EBX is now low surrogate.
    ADD EAX,0x0000_D800         ; EAX is now high surrogate.
    CALL .OutEAX:               ; High surrogate first.
    MOV EAX,EBX                 ; Low surrogate. MOV keeps CF returned from the callback.
    JC .End:                    ; If aborted by CallbackProc.
.NoSg:CALL .OutEAX:             ; Low surrogate or BMP codepoint or UTF-32.
    JNC .Start:                 ; Parse the next UTF-8 character from string ES:SI..ES:DI.
.End:POP ES
    POPAW                       ; Loads CX and DI from %ReturnCX and %ReturnDI.
    RET 2*4                     ; Remove the four pushed arguments.
.OutEAX:PROC                    ; Send EAX to callback. Preserves EBX,ES,SI,DI, updates %ReturnDI.
    PUSH EBX,ES,SI,DI
    MOV DI,[BP+0]               ; Restore %ReturnDI from DecodeUTF8@RT frame.
    PUSH BP
    CALL [BP+20]                ; %CallbackProc.
    POP BP
    MOV [BP+0],DI               ; Update %ReturnDI from DecodeUTF8@RT frame.
    POP DI,SI,ES,EBX            ; MOV and POP keep CF returned from the callback.
    RET
  ENDPROC .OutEAX:
 ENDP1 DecodeUTF8@RT::
%ENDMACRO DecodeUTF8
ENDHEAD string16