113 lines
4.8 KiB
Plaintext
113 lines
4.8 KiB
Plaintext
#UTF8_codePointMaxByteCount = 4 ;Upper bound on the number of bytes UTF-8 uses for one code point; also the minimum highest index of the byte arrays used below
|
|
|
|
;Encode the code point x as UTF-8 into encoded_codepoint().
;  x                   - code point to encode
;  encoded_codepoint() - output array; it is enlarged to hold indexes 0..#UTF8_codePointMaxByteCount
;                        when it is smaller than that.
;On return element 0 holds the number of significant bytes (1 to 4) and elements 1..count
;hold the encoded bytes, lead byte first. Element 0 is set to 0 when x cannot be encoded:
;negative values, values above $10FFFF and the UTF-16 surrogate range $D800-$DFFF.
;Elements beyond the reported count are left untouched.
Procedure UTF8_encode(x, Array encoded_codepoint.a(1))
  If ArraySize(encoded_codepoint()) < #UTF8_codePointMaxByteCount
    ReDim encoded_codepoint.a(#UTF8_codePointMaxByteCount)
  EndIf

  Select x
    Case 0 To $7F ;1 byte: 0xxxxxxx
      encoded_codepoint(0) = 1
      encoded_codepoint(1) = x ;all 7 bits

    Case $80 To $7FF ;2 bytes: 110xxxxx 10xxxxxx
      encoded_codepoint(0) = 2
      encoded_codepoint(1) = (x >> 6) | %11000000 ;upper 5 bits
      encoded_codepoint(2) = (x & %00111111) | %10000000 ;lowest 6 bits

    Case $D800 To $DFFF
      ;Surrogates are reserved for UTF-16 and must not be encoded as UTF-8.
      ;This case has to come before the 3-byte case, because the first matching Case wins.
      encoded_codepoint(0) = 0

    Case $800 To $FFFF ;3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      encoded_codepoint(0) = 3
      encoded_codepoint(1) = (x >> 12) | %11100000 ;upper 4 bits
      encoded_codepoint(2) = ((x >> 6) & %00111111) | %10000000 ;middle 6 bits
      encoded_codepoint(3) = (x & %00111111) | %10000000 ;lowest 6 bits

    Case $10000 To $10FFFF ;4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      encoded_codepoint(0) = 4
      encoded_codepoint(1) = (x >> 18) | %11110000 ;upper 3 bits
      encoded_codepoint(2) = ((x >> 12) & %00111111) | %10000000 ;next 6 bits
      encoded_codepoint(3) = ((x >> 6) & %00111111) | %10000000 ;next 6 bits
      encoded_codepoint(4) = (x & %00111111) | %10000000 ;lowest 6 bits

    Default
      encoded_codepoint(0) = 0 ;error, code point is not valid and can't be encoded
  EndSelect
EndProcedure
|
|
|
|
;Decode one UTF-8 sequence stored in elements 1..4 of encoded_codepoint() (element 0 is ignored).
;Returns the decoded code point, or -1 when
;  - the array's highest index is below #UTF8_codePointMaxByteCount,
;  - element 1 is not a valid lead byte, or
;  - a required continuation byte is not of the form 10xxxxxx.
;Only as many elements as the lead byte announces are examined. Overlong forms and
;results outside the Unicode range are not rejected.
Procedure UTF8_decode(Array encoded_codepoint.a(1))
  Protected x = -1 ;error value, returned unchanged for improper input
  Protected byteCount, i

  If ArraySize(encoded_codepoint()) < #UTF8_codePointMaxByteCount
    ProcedureReturn x ;Input array was not dimensioned properly.
  EndIf

  ;The lead byte announces the sequence length and carries the most significant payload bits.
  ;Its payload is stored unshifted here; every continuation byte then shifts the accumulator
  ;by exactly 6 bits, so the lead bits end up at 6 * (byteCount - 1).
  Select encoded_codepoint(1)
    Case %00000000 To %01111111 ;1 byte encoding
      byteCount = 1
      x = encoded_codepoint(1)
    Case %11000000 To %11011111 ;2 byte encoding, lead byte carries 5 bits
      byteCount = 2
      x = encoded_codepoint(1) & %00011111
    Case %11100000 To %11101111 ;3 byte encoding, lead byte carries 4 bits
      byteCount = 3
      x = encoded_codepoint(1) & %00001111
    Case %11110000 To %11110111 ;4 byte encoding, lead byte carries 3 bits
      byteCount = 4
      x = encoded_codepoint(1) & %00000111
    Default
      ProcedureReturn x ;continuation byte or invalid lead byte in first position
  EndSelect

  For i = 2 To byteCount
    If (encoded_codepoint(i) & %11000000) <> %10000000
      ProcedureReturn -1 ;not a continuation byte
    EndIf
    ;Fully parenthesised so the result does not depend on operator priority.
    x = (x << 6) | (encoded_codepoint(i) & %00111111)
  Next

  ProcedureReturn x
EndProcedure
|
|
|
|
;Build one row of the demo table.
;  c$            - the character to display
;  c             - its code point (shown as U+ followed by hex, zero padded to 5 digits)
;  encoded_utf() - element 0 = number of bytes, elements 1..count = the UTF-8 bytes
;  dcp           - the code point obtained by decoding (shown as hex, zero padded to 5 digits)
;Returns the formatted row as a string.
Procedure.s formatOutput(c$, c, Array encoded_utf.a(1), dcp)
  Protected charColumn$, codePointColumn$, hexBytes$, decodedColumn$
  Protected idx, byteTotal

  charColumn$ = LSet(c$, 8)
  codePointColumn$ = LSet("U+" + RSet(Hex(c), 5, "0"), 10)

  ;Two hex digits per byte, each followed by a single space.
  byteTotal = encoded_utf(0)
  idx = 1
  While idx <= byteTotal
    hexBytes$ = hexBytes$ + RSet(Hex(encoded_utf(idx)), 2, "0") + " "
    idx + 1
  Wend

  decodedColumn$ = RSet(Hex(dcp), 5, "0")

  ProcedureReturn " " + charColumn$ + codePointColumn$ + " " + LSet(hexBytes$, 11, " ") + " " + decodedColumn$
EndProcedure
|
|
|
|
;Sample data: the number of entries, followed by that many Unicode code points in hex.
DataSection
  unicode_codepoints:
  Data.i 5
  ;latin capital letter a; latin small letter o with diaeresis; cyrillic capital letter zhe;
  ;euro sign; musical symbol g clef
  Data.i $41, $F6, $416, $20AC, $1D11E
EndDataSection

;Load the samples into an array indexed 0..last_sample.
Restore unicode_codepoints
Read sample_count
last_sample = sample_count - 1

Dim sample(last_sample)
idx = 0
While idx <= last_sample
  Read sample(idx)
  idx + 1
Wend

;Shared work array for both procedures: after encoding, element 0 is the byte count and the
;following elements are the encoded bytes; decoding reads elements 1 to 4 and ignores element 0.
Dim utf8_bytes.a(#UTF8_codePointMaxByteCount)

If OpenConsole("", #PB_UTF8)
  ;Three-line table header.
  header_top$   = LSet("", 11) + LSet("Unicode", 12) + LSet("UTF-8",14) + LSet("Decoded",12)
  header_names$ = LSet("Character", 11) + LSet("Code Point", 12) + LSet("Encoding",14) + LSet("Code Point",12)
  header_rule$  = LSet("---------", 11) + LSet("----------", 12) + LSet("-----------",14) + LSet("-----------",12)
  PrintN(header_top$)
  PrintN(header_names$)
  PrintN(header_rule$)

  ;Round-trip every sample: encode, decode the result, print one row.
  idx = 0
  While idx <= last_sample
    UTF8_encode(sample(idx), utf8_bytes())
    decoded = UTF8_decode(utf8_bytes()) ;should equal the code point that was encoded
    PrintN(formatOutput(Chr(sample(idx)), sample(idx), utf8_bytes(), decoded))
    idx + 1
  Wend

  Print(#CRLF$ + #CRLF$ + "Press ENTER to exit")
  Input()
  CloseConsole()
EndIf
|