This is a module of the EuroTool program EuroConv; it is OS-independent.
; Program header of the module convmain: assembler options, object format and included libraries.
EUROASM CPU=X64, DumpWidth=36 ; Assembler options valid for this source: 64-bit instruction set, listing dump width 36.
convmain PROGRAM Format=COFF, Width=64 ; The module is built as a 64-bit COFF object; it is closed by ENDPROGRAM convmain.
%DROPMACRO * ; Forget all macros defined so far.
INCLUDEHEAD argument.htm ; NOTE(review): presumably brings in the declarations of Arg* variables referenced below - confirm.
INCLUDE1 status32.htm, cpuext.htm, cpuext64.htm, string64.htm ; Libraries; the code below uses Dispatch, JNSt, LodD, LodH, StoH, LodUTF8, DecodeUTF8, EncodeUTF8 - TODO confirm which file provides which macro.
[.rodata] segment.
These sections are named
[CodePoint] (WORD), [Relevance] (BYTE), [Translit] (DWORD),
[Entity] (QWORD).
; Parallel tables of Unicode points, built from the included file unicode.htm.
; Each line of that file is interpreted by the macro UCP, which appends one member to each of four sections:
;   [CodePoint] WORD  - the Unicode code point (hexadecimal digits in the table),
;   [Relevance] BYTE  - relevance of the character (read as a signed byte by AutodetectEncoding),
;   [Translit]  DWORD - transliteration (written byte by byte by ConvertChar),
;   [Entity]    QWORD - name of the HTML entity, up to 8 characters.
; The members with the same ordinal number in all four sections belong together.
; The statements are restored to one per line; kept on a single line, everything
; behind the first semicolon would be taken as a comment and nothing would be declared.
[.rodata]
; Declare global symbol at the beginning of each section:
[CodePoint]                    ; Switch to section [CodePoint].
CodePoint::
[Relevance]                    ; Switch to section [Relevance].
Relevance::
[Translit]                     ; Switch to section [Translit].
Translit::
[Entity]                       ; Switch to section [Entity].
Entity::
UCP %MACRO CodePoint, Relevance, Translit, Entity ; Declare macroinstruction UCP for interpreting the lines of unicode.htm.
[CodePoint]
      DW 0x%CodePoint
[Relevance]
      DB %Relevance
[Translit]
      DD %Translit + 0         ; "+ 0" keeps the expression valid when the Translit column is empty.
[Entity]
      DQ "%Entity"
    %ENDMACRO UCP
INCLUDE unicode.htm            ; Expand the macro UCP with each line of the table UnicodePoints.
[CodePoint]
CodePointEnd::                 ; End of the [CodePoint] table.
"ISO-8859-2";
"Latin 2 (Central European)";
https://en.wikipedia.org/wiki/Windows-1250;
[CPid].
[Names] one after another.
[Tables] one after another. The first six tables are omitted.
; Tables of supported code pages, built from the included file codepage.htm.
; Each line of that file is interpreted by the macro CP, which appends
;   one WORD with the code page identifier to section [CPid],
;   two zero-terminated names (primary and alternative) to section [Names],
;   128 WORDs with Unicode points of characters 0x80..0xFF to section [Tables]
;   (operands 5..132 of the macro; code pages without a table provide no such operands).
; The statements are restored to one per line; kept on a single line, everything
; behind the first semicolon would be taken as a comment and nothing would be declared.
[.rodata]
; Declare global symbol at the beginning of each section:
[CPid]
CPid::
[Names]
Names::
[Tables]
Tables::
CP %MACRO CPid, CPname, CPaltName, CPurl, CPtable ; Declare macroinstruction CP for interpreting the lines of codepage.htm.
[CPid]
      DW %CPid
[Names]
      DB %CPname,0,%CPaltName,0
[Tables]
tttt  %FOR %*{1+4..128+4}      ; Walk through the 128 table operands which follow the four leading operands.
        DW 0x%tttt
      %ENDFOR tttt
    %ENDMACRO CP
INCLUDE codepage.htm           ; Expand the macro CP with each line of the table CodePages.
[CPid]
CPidEnd::                      ; End of the [CPid] table.
; Constant data of the module: program name, version, the default configuration (help) text
; and the byte order marks of UTF encodings.
Replacement = 0xFFFD ; Codepoint of unsupported character.
[.rodata]
EuroConv:: DB "EuroConv",0
Version:: DB " version %^Date",0
; Help is one zero-terminated text, it starts with UTF-8 byte order mark and its lines are terminated with CR+LF.
Help:: DB 0xEF,0xBB,0xBF ; BOM UTF-8.
 DB "; EuroConv default configuration (UTF-8):",13,10 ; Fixed: the banner named a different program (EuroCalc).
 DB "/InputFile= ; Input file name to be converted.",13,10
 DB "/InputEncoding= ; Encoding of InputFile; autodetect when not specified.",13,10
 DB "/HeaderSize=0 ; Number of bytes to omit from conversion at the begining.",13,10
 DB "/HeaderLength=0 ; Number of lines to omit from conversion at the begining.",13,10
 DB "/FooterSize=0 ; Number of bytes to omit from conversion at the end.",13,10
 DB "/FooterLength=0 ; Number of lines to omit from conversion at the end.",13,10
 ; The next text contains a double quote, therefore it is enclosed in apostrophes
 ; (fixed: the quote inside a double-quoted string terminated the string prematurely).
 DB '/HtmlEntities=I ; Ignore|Convert-to character|NonASCII-convert-all-but-&-<->-".',13,10
 DB "/OutputFile= ; Output file name where the converted input will be saved.",13,10
 DB "/OutputEncoding= ; Encoding of OutputFile; UTF-8 when not specified.",13,10
 DB "/ByteOrderMark=yes ; Use BOM in output UTF encoding.",13,10
 DB "/InvalidCharacter=T ; Transliterate|Convert-to-HTML-entity|Question-mark|Omit.",13,10
 DB 0 ; Terminator of the Help text.
; Byte order marks, as they are stored at the beginning of a file. AutodetectEncoding compares them with the input.
BOM_UTF32BE DB 0x00,0x00,0xFE,0xFF
BOM_UTF32LE DB 0xFF,0xFE,0x00,0x00
BOM_UTF16BE DB 0xFE,0xFF
BOM_UTF16LE DB 0xFF,0xFE
BOM_UTF8 DB 0xEF,0xBB,0xBF ; AutodetectEncoding loads it as DWORD and masks off the fourth byte.
; Uninitialized variables of the module. Symbols terminated with :: are global, those with a single colon are local.
[.bss]
WriteProxy:: D QWORD ; Address of the proxy procedure which writes to FileOutput. ConvertChar calls it once per output byte; NOTE(review): presumably it writes the byte from AL and preserves registers - confirm in the OS-specific module.
HeaderInMemPtr:: D QWORD ; Pointer to the FileInput contents, also header mapped in memory.
HeaderInMemEnd:: D QWORD ; End of FileInput header, mapped in memory. The conversion starts here.
FooterInMemPtr:: D QWORD ; Pointer to the footer mapped in memory. The conversion stops here.
FooterInMemEnd:: D QWORD ; Pointer to the end of footer, also end of FileInput mapped in memory.
OutputTable: D QWORD ; Pointer to the translation table for output encoding (128 words), set by ConvFile when the output encoding is 8-bit.
CodePagesLength:: D DWORD ; Number of members in CodePages table (1+5+75=81). It is not set in this module.
CodePointLength:: D DWORD ; Number of members in UnicodePoints table (1227). It is not set in this module.
ArgNr:: D DWORD ; Ordinal number of command-line argument.
Converted:: D DWORD ; Number of converted characters (including errors). Incremented on each call of ConvertChar.
InputErrors:: D DWORD ; Number of bad input characters (not defined in the declared input encoding, also nonpaired surrogates).
OutputErrors:: D DWORD ; Number of bad output characters (could not be converted to the declared output encoding).
HtmlEntity: D 8*BYTE ; Room for HTML entity (without ampersand and semicolon), used by HtmlEntityDecode.
[.text]
; AutodetectEncoding guesses the encoding of the text between [HeaderInMemEnd] and [FooterInMemPtr].
; At most 256 KB of the text is examined. The sample is decoded successively as ASCII, UTF-32LE, UTF-32BE,
; UTF-16LE, UTF-16BE, UTF-8 and then by each 8-bit table from [Tables]; every decoded character adds its
; relevance to a signed sum (saldo) and the encoding with the highest saldo wins.
; Input:  [HeaderInMemEnd], [FooterInMemPtr], [CodePagesLength], [CodePointLength], tables CodePoint, Relevance, CPid, Tables.
; Output: RAX=identifier of the detected code page; 65001 (UTF-8) when the text is shorter than 16 bytes.
; No register is saved here; RAX,RBX,RCX,RDX,RSI,RDI,R8..R13 are written.
; NOTE(review): the symbol Bm and the symbol shown as ?? are not defined in this part of the source.
AutodetectEncoding:: PROC
MOV R8,[HeaderInMemEnd] ; R8=start of the examined text, kept for all trials.
MOV R9,[FooterInMemPtr]
SUB R9,R8 ; R9=size of the examined text in bytes.
MOV R12,65001 ; UTF-8 is the answer when the detection is not possible.
CMP R9D,16
JB .90: ; Shorter files cannot be autodetected.
CMP R9D,256K
JB .10:
MOV R9D,256K ; Limit the size of the sample.
.10:MOV R10,0x8000_0000_0000_0000 ; R10 is the best relevance saldo so far. R12 is its code page.
XOR R13,R13 ; R13 is 0,1,2 corresponding with character size 1,2,4.
; Try encoding 20127 ASCII.
SUB R11,R11 ; R11=relevance saldo.
MOV RSI,R8 ; Restore pointer to text from R8.
MOV RCX,R9 ; Restore size of text from R9.
XOR EAX,EAX ; LODSB below changes AL only, so EAX holds the zero-extended byte.
.15:LODSB
CMP AL,0x7F
JA .20:
CALL .StoreRelevance:
JMPS .22:
; NOTE(review): the operand ?? below looks like a symbol name garbled in this copy of the source; restore it before assembling.
.20:ADD R11,?? ; Any byte above 0x7F deteriorates the relevance by ??=-32.
.22:LOOP .15:
CMP R11,R10 ; Compare with the best saldo so far in R10.
; NOTE(review): the target .30: would bypass the UTF-32LE trial and leave R13=0 for UTF-32BE;
; the jump is taken only if R11 is not greater than the initial minimal value of R10.
JLE .30:
MOV R12,20127
MOV R10,R11
; Try encoding 12000 UTF-32LE.
MOV R13B,2 ; Character size=4.
SUB R11,R11 ; R11=relevance saldo.
MOV RSI,R8 ; Restore pointer to text from R8.
MOV RCX,R9 ; Restore size of text from R9.
SHR ECX,2 ; RCX=number of 32-bit characters.
MOV EAX,[BOM_UTF32LE]
CMP EAX,[RSI]
JNE .25:
ADD R11,Bm ; The text begins with BOM of this encoding: add the constant Bm to the saldo.
ADD RSI,4 ; Skip the BOM.
DEC ECX
.25:LODSD
CALL .StoreRelevance:
LOOP .25:
CMP R11,R10 ; Compare with the best saldo so far in R10.
JLE .30:
MOV R12,12000
MOV R10,R11
.30: ; Try encoding 12001 UTF-32BE.
SUB R11,R11 ; R11=relevance saldo.
MOV RSI,R8 ; Restore pointer to text from R8.
MOV RCX,R9 ; Restore size of text from R9.
SHR ECX,2 ; RCX=number of 32-bit characters.
MOV EAX,[BOM_UTF32BE]
CMP EAX,[RSI]
JNE .35:
ADD R11,Bm ; The text begins with BOM of this encoding.
ADD RSI,4 ; Skip the BOM.
DEC ECX
.35:LODSD
BSWAP EAX ; Change BigEndian to LittleEndian.
CALL .StoreRelevance:
LOOP .35:
CMP R11,R10 ; Compare with the best saldo so far in R10.
JLE .40:
MOV R12,12001
MOV R10,R11
.40: ; Try encoding 1200 UTF-16LE.
MOV R13B,1 ; Character size=2.
SUB R11,R11 ; R11=relevance saldo.
MOV RSI,R8 ; Restore pointer to text from R8.
MOV RCX,R9 ; Restore size of text from R9.
SHR ECX,1 ; RCX=number of 16-bit characters.
XOR EAX,EAX ; LODSW below changes AX only.
MOV AX,[BOM_UTF16LE]
CMP AX,[RSI]
JNE .45:
ADD R11,Bm ; The text begins with BOM of this encoding.
ADD RSI,2 ; Skip the BOM.
DEC ECX
.45:LODSW
CALL .StoreRelevance:
LOOP .45:
CMP R11,R10 ; Compare with the best saldo so far in R10.
JLE .50:
MOV R12,1200
MOV R10,R11
.50: ; Try encoding 1201 UTF-16BE.
SUB R11,R11 ; R11=relevance saldo.
MOV RSI,R8 ; Restore pointer to text from R8.
MOV RCX,R9 ; Restore size of text from R9.
SHR ECX,1 ; RCX=number of 16-bit characters.
XOR EAX,EAX ; LODSW below changes AX only.
MOV AX,[BOM_UTF16BE]
CMP AX,[RSI]
JNE .55:
ADD R11,Bm ; The text begins with BOM of this encoding.
ADD RSI,2 ; Skip the BOM.
DEC ECX
.55:LODSW
XCHG AL,AH ; Change BigEndian to LittleEndian.
CALL .StoreRelevance:
LOOP .55:
CMP R11,R10 ; Compare with the best saldo so far in R10.
JLE .60:
MOV R12,1201
MOV R10,R11
.60: ; Try encoding 65001 UTF-8.
SUB R11,R11 ; R11=relevance saldo.
MOV RSI,R8 ; Restore pointer to text from R8.
MOV RCX,R9 ; Restore size of text from R9.
MOV EAX,[BOM_UTF8] ; The BOM has three bytes only, the fourth loaded byte is masked off below.
MOV EDX,[RSI]
AND EAX,0x00FFFFFF
AND EDX,0x00FFFFFF
CMP EAX,EDX
JNE .65:
ADD R11,Bm ; The text begins with BOM of this encoding.
ADD RSI,3 ; Skip the BOM.
SUB RCX,3
; The callback .StoreRelevanceUTF8 is invoked with the decoded characters; NOTE(review): see string64.htm for the exact contract of the callback.
.65:DecodeUTF8 RSI,.StoreRelevanceUTF8,Size=RCX,Width=32 ; Use macro from string64.htm.
CMP R11,R10 ; Compare with the best saldo so far in R10.
JLE .70:
MOV R12,65001
MOV R10,R11
.70: ; Try 8bit encoding OEM or WIDE according to CodePages 437..28606.
XOR EDX,EDX ; RDX is index to [Tables]. It is advanced by INC DH, i.e. by 256 bytes (one table).
XOR R13,R13 ; Character size=1.
.72:LEA RBX,[RDX+Tables::] ; RBX=the examined table of 128 words.
SUB R11,R11 ; R11=relevance saldo.
MOV RSI,R8 ; Restore pointer to text from R8.
MOV RCX,R9 ; Restore size of text from R9.
.75:XOR EAX,EAX
LODSB
CMP AL,0x7F
JBE .80: ; ASCII characters are not translated.
MOV AX,[RBX+2*RAX-256] ; Translate AL (128..255) to unicode point in AX by this table.
.80:CALL .StoreRelevance: ; Add relevance of unicode point EAX to R11.
LOOP .75: ; The next character from the sample.
CMP R11,R10 ; Compare with the best saldo so far in R10.
JLE .85: ; Skip when poor.
MOV R10,R11 ; R10 is the best saldo so far.
LEA RCX,[CPid::]
MOVZX EAX,DH ; EAX=ordinal number of the examined table.
MOVZXW R12,[RCX+2*RAX+2*6]; R12 is the best encoding so far. Table 0 belongs to the 7th member of [CPid].
.85:INC DH ; Try the next encoding.
LEA EAX,[EDX+256*6] ; AH=ordinal number of the next table + 6, which is the index of its code page in [CPid].
CMP AH,[CodePagesLength] ; Each OEM/ANSI table is 2*128 bytes long.
JB .72:
.90:MOV RAX,R12 ; Autodetected encoding is returned in RAX.
RET
.StoreRelevanceUTF8:: PROC ; This subprocedure will add relevance of a character decoded from UTF-8
XOR R13,R13 ; with codepoint RAX to the saldo in R11. Clobbers: RAX,RDI,R13.
CMP EAX,80h
JB .4: ; R13=0 for codepoints below 0x80.
INC R13
CMP EAX,800h
JB .4: ; R13=1 for codepoints 0x80..0x7FF.
INC R13 ; R13=2 for codepoints 0x800 and above.
.4: ; Continue with .StoreRelevance:. R13=0,1,2.
ENDP .StoreRelevanceUTF8: ; There is no RET here, the execution falls through to .StoreRelevance.
.StoreRelevance: PROC ; This subprocedure will add relevance of a character
PUSH RCX,RSI ; with codepoint EAX to the saldo in R11. Clobbers: RAX,RDI.
XOR ESI,ESI
CMP EAX,0x0000_FFFF
JA .9: ; Characters above BMP (asian, emojis) do not influence the relevance.
LEA RDI,[CodePoint::]
LEA RSI,[RDI+2] ; RSI=position behind the first member, used to compute the index of the found member.
MOV ECX,[CodePointLength]
REPNE SCASW ; Search for AX in the table of Unicode points.
JE .6:
; NOTE(review): the operand ?? below looks like a symbol name garbled in this copy of the source; restore it before assembling.
MOV RSI,?? ; Deteriorate the relevance when this character is not in our Unicode table.
JMP .8:
.6: SUB RDI,RSI ; Otherwise find the corresponding relevance.
LEA RSI,[Relevance::]
SHR RDI,1 ; RDI=index of the found member.
MOVSXB RSI,[RSI+RDI] ; The relevance is a signed byte.
.8: MOV RCX,R13
SAL RSI,CL ; Double (UTF-16) or quadruple (UTF-32) the relevance.
ADD R11,RSI ; Add it to the saldo R11.
.9: POP RSI,RCX
CLC ; Always return CF=0. NOTE(review): presumably tells the macro DecodeUTF8 to continue - confirm.
RET
ENDP .StoreRelevance:
ENDP AutodetectEncoding
; HeaderAndFooter computes the part of the input which will be converted.
; The header (ArgHeaderSize bytes, then ArgHeaderLength lines) and the footer
; (ArgFooterSize bytes, then ArgFooterLength lines) are excluded from the conversion.
; Lines are terminated by LineFeed (10), which is searched for as a byte, word or dword
; according to ArgInputEncoding (UTF-16LE|BE, UTF-32LE|BE, any other encoding is byte-oriented).
; Input:  [HeaderInMemPtr], [FooterInMemEnd], ArgHeaderSize, ArgFooterSize,
;         ArgHeaderLength, ArgFooterLength, ArgInputEncoding.
; Output: [HeaderInMemEnd]=start and [FooterInMemPtr]=end of the text to convert.
;         CF=1 when the footer would begin below the end of the header
;         (also when the omitted header lines reach the footer).
; Clobbers: RAX,RBX,RCX,RDX,RSI,RDI. The direction flag is cleared on return.
; Fixed: the input encoding is now loaded to EDX also before the footer lines are searched;
; it used to be loaded only in the header part, which is skipped when ArgHeaderLength=0.
HeaderAndFooter:: PROC
        ; Restrict the input by ArgHeaderSize and ArgFooterSize.
        MOV EAX,[ArgHeaderSize::]
        ADD RAX,[HeaderInMemPtr]
        MOV [HeaderInMemEnd],RAX
        MOV RAX,[FooterInMemEnd]
        MOV ECX,[ArgFooterSize::]
        SUB RAX,RCX
        MOV [FooterInMemPtr],RAX
        CMP RAX,[HeaderInMemEnd]
        JC .90:                        ; Return CF if Footer is below Header.
        ; Restrict the remaining input by ArgHeaderLength.
        MOV EBX,[ArgHeaderLength::]    ; How many lines to omit.
        TEST EBX
        JZ .40:
        MOV EAX,10                     ; LineFeed is a terminating character of records with variable size.
        MOV RCX,[FooterInMemPtr]
        MOV RDI,[HeaderInMemEnd]
        SUB RCX,RDI
        MOV EDX,[ArgInputEncoding::]
        ; EAX=10, EBX=header lines, EDX=encoding, RCX=file size, RDI=^file in memory.
        Dispatch DX,1200,1201,12000,12001 ; UTF-16 and UTF-32 encodings.
.10:    REPNE SCASB                    ; All other encodings keep the byte 10 as LineFeed.
        JNE .35:                       ; No more LineFeed in the input.
        DEC EBX                        ; Header length.
        JNZ .10:
        JMP .35:
.1200:  SHR ECX,1                      ; UTF-16LE. RCX=number of 16-bit characters.
.15:    REPNE SCASW
        JNE .35:
        DEC EBX                        ; Header length.
        JNZ .15:
        JMP .35:
.1201:  SHR ECX,1                      ; UTF-16BE.
        XCHG AL,AH                     ; LineFeed as it is seen when BigEndian word is loaded.
.20:    REPNE SCASW
        JNE .35:
        DEC EBX                        ; Header length.
        JNZ .20:
        JMP .35:
.12000: SHR ECX,2                      ; UTF-32LE. RCX=number of 32-bit characters.
.25:    REPNE SCASD
        JNE .35:
        DEC EBX                        ; Header length.
        JNZ .25:
        JMP .35:
.12001: SHR ECX,2                      ; UTF-32BE.
        BSWAP EAX                      ; LineFeed as it is seen when BigEndian dword is loaded.
.30:    REPNE SCASD
        JNE .35:
        DEC EBX                        ; Header length.
        JNZ .30:
.35:    MOV [HeaderInMemEnd],RDI       ; RDI points behind the last omitted header line.
        CMP RDI,[FooterInMemPtr]
        CMC                            ; CF=1 when the header reached the footer.
        JC .90:
.40:    ; Restrict the input by ArgFooterLength.
        MOV EDX,[ArgInputEncoding::]   ; Fix: EDX was undefined here when no header lines were omitted.
        MOV EBX,[ArgFooterLength::]    ; How many lines to omit.
        TEST EBX                       ; TEST also clears CF, so JZ returns with CF=0.
        JZ .90:
        MOV EAX,10                     ; LineFeed is a terminating character of records with variable size.
        MOV RCX,[FooterInMemPtr]
        MOV RSI,[HeaderInMemEnd]
        MOV RDI,RCX
        SUB RCX,RSI
        JC .90:
        INC EBX                        ; One more LineFeed is searched for: the text behind it belongs to the footer.
        ; EAX=10,EBX=footer lines, EDX=encoding, RCX=file size, RDI=^end of file in memory.
        STD                            ; Search backward. Each path below ends at .80:, which restores the direction.
        Dispatch DX,1200d,1201d,12000d,12001d
        DEC RDI                        ; All other encodings keep the byte 10 as LineFeed.
.45:    REPNE SCASB
        JNE .80:
        DEC EBX                        ; Footer length.
        JNZ .45:
        LEA RDI,[RDI+2]                ; RDI was one byte below the found LineFeed. Point behind it.
        JMP .80:
.1200d: SHR ECX,1                      ; UTF-16LE.
        LEA RDI,[RDI-2]                ; The last 16-bit character.
.50:    REPNE SCASW
        JNE .80:
        DEC EBX                        ; Footer length.
        JNZ .50:
        LEA RDI,[RDI+2+2]              ; Point behind the found LineFeed.
        JMP .80:
.1201d: SHR ECX,1                      ; UTF-16BE.
        LEA RDI,[RDI-2]                ; The last 16-bit character.
        XCHG AL,AH
.55:    REPNE SCASW
        JNE .80:
        DEC EBX                        ; Footer length.
        JNZ .55:
        LEA RDI,[RDI+4]                ; Point behind the found LineFeed.
        JMP .80:
.12000d:SHR ECX,2                      ; UTF-32LE.
        LEA RDI,[RDI-4]                ; The last 32-bit character.
.60:    REPNE SCASD
        JNE .80:
        DEC EBX                        ; Footer length.
        JNZ .60:
        LEA RDI,[RDI+4+4]              ; Point behind the found LineFeed.
        JMP .80:
.12001d:SHR ECX,2                      ; UTF-32BE.
        LEA RDI,[RDI-4]                ; The last 32-bit character.
        BSWAP EAX
.70:    REPNE SCASD
        JNE .80:
        DEC EBX                        ; Footer length.
        JNZ .70:
        LEA RDI,[RDI+4+4]              ; Point behind the found LineFeed.
.80:    CLD
        MOV [FooterInMemPtr],RDI
        CMP RDI,[HeaderInMemEnd]       ; Return CF if Footer is below Header.
.90:    RET
      ENDP HeaderAndFooter
0x0000_00A0.
; HtmlEntityDecode tries to decode an HTML entity (named, decimal or hexadecimal) from the input text.
; It is called when the ampersand has just been read from the input.
; Input:  RSI=pointer to the input text behind the ampersand, RDX=end of the input text,
;         RAX=codepoint of the ampersand (it is returned unchanged when no entity is decoded),
;         ArgInputEncoding selects the size and endianness of input characters,
;         ArgHtmlEntities='C' converts all entities, otherwise entities with codepoint below 0x80 are ignored.
; Output: when the entity was decoded, RAX=its codepoint and RSI points behind the terminating semicolon.
;         Otherwise all registers are preserved. Flags are undefined.
; The local variable [HtmlEntity] is overwritten.
; Fixed:  1. The letter following "&#" was masked with AND instead of OR, so the hexadecimal
;            notation "&#x...;" was never recognized.
;         2. An entity of the maximal size (8 characters) was accepted without its semicolon,
;            which then remained in the input. The semicolon is required now.
;         3. An empty entity name "&;" is no longer searched for in the table.
HtmlEntityDecode: PROC
        PUSH RAX,RBX,RCX,RDX,RSI,RDI,R8,R9
        XOR EAX,EAX
        LEA RDI,[HtmlEntity]           ; 8 bytes of temporary room for the entity.
        MOV [RDI],RAX                  ; Clear HtmlEntity.
        MOV R8D,8                      ; Maximal possible size of HTML entity, without ampersand and semicolon.
        MOV R9D,[ArgInputEncoding::]
        MOV ECX,1
        AND RCX,R9                     ; RCX=1 for UTF-BE, RCX=0 for UTF-LE, RCX=? for other encodings.
.10:    Dispatch R9W, 1200, 1201, 12000, 12001 ; Input encodings is UTF?
        CMP RSI,RDX                    ; If not, then it is 8-bit ASCII|ANSI|OEM|UTF-8.
        JNB .90:
        LODSB                          ; The upper bytes of EAX stay zero.
        JMP .20:
.1201:
.1200:  ; 16-bit characters.
        LEA RBX,[RSI+1]
        CMP RBX,RDX                    ; Out of input string?
        JNB .90:
        LODSW
        JRCXZ .20:
        XCHG AL,AH                     ; Input character is BigEndian.
        JMP .20:
.12001:
.12000: ; 32-bit characters.
        LEA RBX,[RSI+3]
        CMP RBX,RDX                    ; Out of input string?
        JNB .90:
        LODSD
        JRCXZ .20:
        BSWAP EAX                      ; Input character is BigEndian.
.20:    CMP EAX,';'
        JE .30:
        TEST R8D,R8D                   ; Is the temporary room already full?
        JZ .90:                        ; Too long to be an entity, ignore it.
        STOSB                          ; Store the ASCII character to temporary entity room.
        DEC R8D
        JMP .10:                       ; Read the next character or the terminating semicolon.
.30:    LEA RDX,[HtmlEntity:]          ; At the temporary entity, e.g. "euro", zero terminated.
        CMPB [RDX],'#'                 ; Is it numeric entity?
        JE .40:
        MOV RAX,[RDX]
        TEST RAX,RAX                   ; Empty name cannot be an entity.
        JZ .90:
        LEA RDI,[Entity:]              ; Beginning of the [Entity] section.
        LEA RBX,[RDI+8]
        MOV ECX,[CodePointLength:]     ; Length of the Unicode table.
        REPNE SCASQ
        JNE .90:                       ; If the &entity; was not found in Unicode points table.
        SUB RDI,RBX                    ; Index in the table, multiplied by 8.
        SHR RDI,2                      ; Index multiplied by 2, which is the size of members in [CodePoint].
        XOR EAX,EAX
        MOV AX,[RDI+CodePoint:]        ; Select Unicode point corresponding with entity position in the table.
.35:    CMPB [ArgHtmlEntities::],'C'   ; Convert all HTML entities, including ASCII entities?
        JE .80:
        CMP EAX,0x0000_0080            ; Is it ASCII entity <, >, &, "?
        JB .90:                        ; Ignore if yes.
        JMP .80:
.40:    INC RDX                        ; Skip the character '#'.
        MOV RCX,RSI                    ; Temporary save RSI.
        MOV AL,[RDX]
        OR AL,'x'^'X'                  ; Convert to lower case (set the bit which distinguishes 'x' from 'X').
        CMP AL,'x'
        JNE .50:
        INC RDX                        ; Skip the character 'x'.
        LodH RDX                       ; Hexadecimal entity code expected.
        JMP .60:
.50:    LodD RDX                       ; Decimal entity code expected.
.60:    JC .90:                        ; CF if wrong number.
        MOV RSI,RCX                    ; Restore RSI to point behind the terminating semicolon.
        JMP .35:                       ; Check if it's ASCII entity.
.80:    MOV [RSP+3*8],RSI              ; %ReturnRSI - behind the semicolon.
        MOV [RSP+7*8],RAX              ; %ReturnRAX - entity value.
.90:    POP R9,R8,RDI,RSI,RDX,RCX,RBX,RAX
        RET
      ENDP HtmlEntityDecode:
HeaderInMemEnd: .. FooterInMemPtr:, respecting arguments /HtmlEntities and /InvalidCharacter.
Converted characters are written by calling ConvertChar, which converts it from Unicode point and finally calls a procedure
specified in [WriteProxy]. This procedure is specific for Linux and for Windows, but this module
convmain.htm
is OS-independent.
ConvFile first converts input character in ArgInputEncoding:: to Unicode point (32-bit) and then converts the Unicode point to the final ArgOutputEncoding::.
HeaderInMemPtr .. FooterInMemEnd maps the input data.
WriteProxy: contains pointer to the function WriteFileProxy
defined in Linux or Windows module, which writes the output data.
; ConvFile converts the text between [HeaderInMemEnd] and [FooterInMemPtr] from ArgInputEncoding
; to ArgOutputEncoding. Each input character is converted to Unicode point in EAX first
; and then it is passed to ConvertChar, which encodes it and writes it by [WriteProxy].
; Input:  [HeaderInMemEnd], [FooterInMemPtr], ArgInputEncoding, ArgOutputEncoding, ArgHtmlEntities,
;         flag ArgBOM in [Status] (byte order mark is written to the output in UTF encodings when set).
; Output: [OutputTable] is set when the output encoding is 8-bit. [InputErrors] counts bad input characters.
;         CF=0 at the end of input. CF=1 when [HeaderInMemEnd] is 0 or when the 8-bit input encoding is unknown.
;         The loop of UTF-8 input also returns when ConvertChar returned CF=1.
; No register is saved here.
; Fixed:  1. The first code page which has a translation table (the 7th member of [CPid], its table
;            is at the start of [Tables], see AutodetectEncoding) was refused as the input encoding
;            and got no [OutputTable] as the output encoding, because JNA also jumps when the
;            difference is zero. JB is used now.
;         2. Input encoding which was not found in [CPid] is reported by CF=1. It used to be
;            silently converted with the last table.
ConvFile:: PROC
        MOV EAX,[ArgOutputEncoding::]
        JNSt [Status::],ArgBOM, .05:   ; Skip when no BOM is required.
        Dispatch AX,1200d, 1201d,12000d,12001d,65001d ; Only UTF encodings use BOM.
.05:    LEA RDI,[CPid]
        LEA RDX,[RDI+7*2]              ; RDI stops here if the 7th code page is found. It is the first one with translation table.
        MOV ECX,[CodePagesLength]
        REPNE SCASW
        JE .10:                        ; Output encoding was found.
        MOV [ArgOutputEncoding::],EAX
        JMP .15:
.1200d:
.1201d:
.12000d:
.12001d:
.65001d:
        MOV EAX,0x0000_FEFF            ; Byte order mark is written as an ordinary character.
        CALL ConvertChar:
        MOV EAX,[ArgOutputEncoding::]
        JMP .05:
.10:    SUB RDI,RDX                    ; RDI=2*(ordinal number of the table).
        JB .15:                        ; Skip when the output encoding has no table (UTF or ASCII).
        SHL EDI,7                      ; Output encodings is 8-bit, let's select the translation OutputTable..
        LEA RBX,[RDI+Tables:]
        MOV [OutputTable],RBX          ; Conversion table for output (to 8-bit) encoding.
.15:    MOV RSI,[HeaderInMemEnd]
        MOV RDX,[FooterInMemPtr]
        TEST RSI
        JZ .80:
        MOV EAX,[ArgInputEncoding::]
        XOR ECX,ECX                    ; After Dispatch RCX will be zero for input encoding UTF-LE or not zero for UTF-BE.
        Dispatch AX, 65001, 20127, 1200, 1201, 12000, 12001 ; Input encodings UTF or ASCII.
        LEA RDI,[CPid]                 ; Undispatched input encoding is 8bit, let's select the translation input table to RBX.
        MOV ECX,[CodePagesLength]
        LEA RBX,[RDI+7*2]              ; RDI stops here if the 7th code page is found. It is the first one with translation table.
        REPNE SCASW
        JNE .80:                       ; Unknown input encoding.
        SUB RDI,RBX                    ; RDI=2*(ordinal number of the table).
        JB .80:                        ; The found code page has no table.
        SHL EDI,7
        LEA RBX,[RDI+Tables:]          ; RBX is the conversion table for input (from 8-bit) encoding.
.20:    CMP RSI,RDX                    ; Convert RSI..RDX from OEM|ANSI 8bit encoding to UnicodePoints (UTF-32LE) in EAX.
        JNB .90:                       ; Return with CF=0.
        XOR EAX,EAX
        LODSB
        CMP AL,128
        JB .25:
        MOV AX,[RBX+2*RAX-256]         ; Translate character AL=0x80..0xFF to AX=UnicodePoint.
        CMP AX,Replacement
        CMC                            ; CF=1 if AX=0xFFFD, 0xFFFE or 0xFFFF (replacement or undefined).
        ADCD [InputErrors],0
.25:    CMPB [ArgHtmlEntities::],'I'   ; Skip if input HTML entity should be ignored (treated as ordinary string).
        JE .30:
        CMP EAX,'&'                    ; Possible beginning of HTML entity?
        JNE .30:
        CALL HtmlEntityDecode:
.30:    ; Unicode point EAX will be encoded to OutputEncoding and stored by [WriteProxy] procedure.
        CALL ConvertChar:              ; Convert the codepoint EAX to output encoding and write it to the output file.
        JMP .20:                       ; Read the next character.
.65001: ; Convert RSI..RDX from UTF-8 encoding to UnicodePoint (UTF-32LE) in EAX.
        CMP RSI,RDX
        JNB .90:
        CMPB [ArgHtmlEntities::],'I'
        JE .33:
        CMPB [RSI],'&'
        JNE .33:
        XOR EAX,EAX
        LODSB                          ; Read the ampersand.
        CALL HtmlEntityDecode:
        JMP .34:
.33:    LodUTF8 RSI                    ; Use macro from library cpuext64.htm.
.34:    CALL ConvertChar:
        JNC .65001:
        JMP .90:
.20127: ; ASCII encoded text is not converted, bytes 0x00..0x7F are stored as UnicodePoints. Bytes above 0x7F are replaced.
        CMP RSI,RDX
        JNB .90:                       ; Return with CF=0 when end of file.
        XOR EAX,EAX
        LODSB
        CMP AL,127
        JBE .35:
        INC [InputErrors:]
        MOV EAX,Replacement
.35:    CMPB [ArgHtmlEntities::],'I'
        JE .38:
        CMP AX,'&'
        JNE .38:
        CALL HtmlEntityDecode
.38:    CALL ConvertChar:              ; Write codepoint EAX to the output file.
        JMP .20127:
.1201:  DEC RCX                        ; Signalize BigEndian on input.
.1200:  ; Convert RSI..RDX from UTF-16 encoding to UnicodePoint (UTF-32LE) in EAX.
        CMP RSI,RDX
        JNB .90:                       ; Return with CF=0 when end of file.
        XOR EAX,EAX
        LODSW
        JRCXZ .40:
        XCHG AL,AH                     ; Change BigEndian to LittleEndian.
.40:    CMPB [ArgHtmlEntities::],'I'
        JE .43:
        CMP AX,'&'
        JNE .43:
        CALL HtmlEntityDecode:
.43:    CMP EAX,0xD800
        JB .45:
        CMP EAX,0xDBFF
        JA .45:
        SUB EAX,0xD800                 ; EAX is high surrogate 0xD800..0xDBFF. Put its value to EBX and then expect the low surrogate.
        MOV EBX,EAX
        SHL EBX,10
        CMP RSI,RDX                    ; Low surrogate expected.
        JB .60:
        CMC                            ; CF=1 now, as the input ended behind the high surrogate.
        ADCD [InputErrors],0           ; Ignore the high surrogate and count an error.
        JMP .1200:
.45:    CMP EAX,0xDC00                 ; Ordinary character or high surrogate expected.
        JB .70:
        CMP EAX,0xDFFF
        JA .70:
.50:    INCD [InputErrors]             ; EAX is orphaned low surrogate (without preceding high surrogate). Count error, ignore and read the next.
        JMP .1200:
.55:    SUB RSI,2                      ; Low surrogate is missing after high surrogate. Count as an error, go back and read an ordinary UTF-16.
        JMP .50:
.60:    LODSW                          ; Low surrogate expected and nothing else.
        JRCXZ .65:
        XCHG AL,AH                     ; Change BigEndian to LittleEndian.
.65:    CMP EAX,0xDC00
        JB .55:
        CMP EAX,0xDFFF
        JA .55:
        SUB EAX,0xDC00
        LEA EAX,[RAX+RBX+0x10000]      ; Complete the codepoint from high and low surrogates.
.70:    CALL ConvertChar:              ; Convert the codepoint EAX to output encoding and write it to the output file.
        JMP .1200:
.12001: DEC RCX                        ; Signalize BigEndian on input.
.12000: ; Convert RSI..RDX from UTF-32 encoding to UnicodePoint (UTF-32LE) in EAX.
        CMP RSI,RDX
        JNB .90:                       ; Return with CF=0 at EOF.
        LODSD
        JRCXZ .75:
        BSWAP EAX                      ; Change BigEndian to LittleEndian.
.75:    CMPB [ArgHtmlEntities::],'I'
        JE .78:
        CMP EAX,'&'
        JNE .78:
        CALL HtmlEntityDecode:
.78:    CALL ConvertChar:              ; Convert the codepoint EAX to output encoding and write it to the output file.
        JMP .12000:
.80:    STC
.90:    RET
      ENDP ConvFile::
[OutputTable] contains a pointer to the translation table if output encoding is 8-bit OEM|ANSI.
ArgInvalidCharacter in its first letter tells how to treat character
which does not exist in output encoding.
[WriteProxy] contains pointer to the function WriteFileProxy for the output string(s) RDI,RCX.
; ConvertChar encodes one Unicode point to ArgOutputEncoding and writes it to the output.
; Input:  EAX=Unicode point, ArgOutputEncoding, [OutputTable] when the output encoding is 8-bit,
;         the first letter of ArgInvalidCharacter ('C','Q','O', anything else transliterates)
;         tells how to treat a character which does not exist in the output encoding.
; Output: The encoded character is written byte by byte from AL by the procedure in [WriteProxy].
;         [Converted] is incremented, [OutputErrors] is incremented when the character is invalid.
; Preserves RBX,RCX,RDX,RSI. Clobbers RAX,RDI. Flags are returned as they were left by the last
; called [WriteProxy], CF=0 when the output encoding is UTF-8.
; NOTE(review): the procedure in [WriteProxy] is assumed to preserve RAX,RBX,RDX,RSI,RDI - confirm in the OS-specific module.
; Fixed:  1. Low surrogate of UTF-16 was computed from all 20 bits of the codepoint instead of the lowest 10 bits.
;         2. Codepoints above 0x0010_FFFF were accepted in UTF-16 (the limit was 0x001F_FFFF).
;         3. RBX was destroyed when surrogates were written, although ConvFile keeps the pointer
;            to the input translation table in RBX. RBX is saved now.
;         4. The prefix of hexadecimal HTML entity '&#x' was missing (empty string was stored instead).
;         5. Codepoints above 0x0000_FFFF are not searched for in the 16-bit tables anymore;
;            only their lower 16 bits were compared, which could match an unrelated character.
ConvertChar: PROC
        PUSH RBX,RCX,RDX,RSI
        INC [Converted:]
        MOV EDX,[ArgOutputEncoding::]
        Dispatch DX, 65001, 20127, 1200, 1201, 12000, 12001 ; Special encodings UTF or ASCII.
        CMP EAX,0x0000_007F
        JBE .80:                       ; ASCII characters are identical in all 8-bit encodings.
        CMP EAX,0x0000_FFFF
        JA .Inv:                       ; The tables keep 16-bit codepoints only.
        MOV RDI,[OutputTable]          ; Convert UnicodePoint EAX to the output 8-bit encoding defined by [OutputTable].
        MOV ECX,128
        LEA RSI,[RDI+2-256]            ; Makes the difference RDI-RSI equal to 2*(128+index) when found.
        REPNE SCASW                    ; Try to find AX in output encoding table.
        JNE .Inv:
        SUB RDI,RSI
        MOV EAX,EDI
        SHR EAX,1                      ; AL is now a character in output 8-bit encoding (0x80..0xFF).
        JMP .80:
.12001: ; Convert UnicodePoint EAX to the output encoding UTF-32BE.
        BSWAP EAX
.12000: ; Convert UnicodePoint EAX to the output encoding UTF-32LE.
        CALL [WriteProxy:]
        SHR EAX,8
        CALL [WriteProxy:]
        SHR EAX,8
        CALL [WriteProxy:]
        SHR EAX,8
        JMP .80:                       ; Write the fourth byte.
.1201:
.1200:  ; Convert UnicodePoint EAX to the output encoding UTF-16.
        CMP EAX,0x0000_D800
        JB .40:
        CMP EAX,0x0000_DFFF
        JBE .Inv:                      ; Codepoints in surrogate range 0xD800..0xDFFF are invalid.
        CMP EAX,0x0000_FFFF
        JBE .40:
        CMP EAX,0x0010_FFFF            ; The highest codepoint which can be encoded in UTF-16.
        JA .Inv:
        ; Codepoint EAX in range 0x0001_0000..0x0010_FFFF will be stored as two surrogate 16-bit code units.
        SUB EAX,0x0001_0000            ; EAX is a 20-bit number now.
        MOV EBX,EAX
        AND EBX,0x0000_03FF            ; The lower 10 bits will be stored in low surrogate.
        SHR EAX,10                     ; The upper 10 bits will be stored in high surrogate.
        ADD EAX,0x0000_D800            ; High surrogate.
        CMP DX,1201
        JNE .30:
        XCHG AL,AH
.30:    CALL [WriteProxy:]
        SHR EAX,8
        CALL [WriteProxy:]
        LEA EAX,[RBX+0x0000_DC00]      ; Low surrogate.
.40:    CMP DX,1201                    ; Big Endian?
        JNE .50:
        XCHG AL,AH
.50:    CALL [WriteProxy:]
        SHR EAX,8
        JMP .80:                       ; Write the second byte.
.65001: ; Convert UnicodePoint EAX to the output encoding UTF-8.
        PUSH RAX                       ; Make temporary 8-bytes room on stack.
        MOV RDI,RSP
        MOV RSI,RSP
        EncodeUTF8                     ; Use macro from the library string64.htm. It advances RDI behind the encoded bytes.
.60:    LODSB
        CALL [WriteProxy:]
        CMP RSI,RDI
        JB .60:
        POP RAX
        JMP .90:
.20127:
        CMP EAX,0x0000_007F            ; When output encoding is ASCII, all codepoints above 0x0000_007F are invalid.
        JBE .80:
.Inv:   ; Character EAX does not exist in output encoding, it is invalid.
        INC [OutputErrors]
        MOV CL,[ArgInvalidCharacter::]
        Dispatch CL,'C','Q','O'        ; How to treat invalid characters.
;.T:    ; Transliterate invalid character EAX to ASCII.
        CMP EAX,0x0000_FFFF
        JA .Q:                         ; The table of Unicode points keeps 16-bit codepoints only.
        LEA RDI,[CodePoint:]
        LEA RSI,[RDI+2]
        MOV ECX,[CodePointLength:]     ; Length of UnicodePoints table.
        REPNE SCASW
        JNE .Q:
        SUB RDI,RSI                    ; RDI=2*index.
        MOV EAX,[2*RDI+Translit:]      ; Members of [Translit] are 4 bytes long.
        ; NOTE(review): when the member of [Translit] is 0, one zero byte is written - confirm that this is intended.
.T5:    CALL [WriteProxy:]             ; Write the bytes of RAX up to the first zero byte.
        SHR RAX,8
        TEST AL
        JNZ .T5:
.O:     ; Ignore (omit) invalid character.
        JMP .90:
.Q:     ; Replace invalid character EAX with '?'.
        MOV EAX,'?'
        JMP .80:
.C:     ; Convert invalid character EAX to HTML entity in hexa notation, e.g. "&#x20AC;".
        PUSH RAX                       ; Make temporary 8-bytes room.
        MOVD [RSP],'&#x'               ; Three characters of the prefix, the fourth byte will be overwritten.
        LEA RDI,[RSP+3]
        StoH RDI, Size=4               ; Four hexadecimal digits. Only the lower 16 bits of the codepoint fit to the room.
        MOV AL,';'
        STOSB
        POP RAX                        ; RAX=the whole entity (8 characters).
        JMP .T5:                       ; Write it.
.80:    CALL [WriteProxy:]
.90:    POP RSI,RDX,RCX,RBX
        RET
      ENDP ConvertChar:
    ENDPROGRAM convmain