RosettaCodeData/Task/UTF-8-encode-and-decode/PureBasic/utf-8-encode-and-decode.basic

;Maximum number of bytes used by the UTF-8 encoding of a single code point.
;The encode/decode procedures and the main program use it as the minimum
;upper bound of the byte arrays they work on.
#UTF8_codePointMaxByteCount = 4

Procedure UTF8_encode(x, Array encoded_codepoint.a(1)) ;x is codepoint to encode, the array will contain output
  ;Encodes the Unicode code point x as UTF-8.
  ;
  ;Output layout of encoded_codepoint():
  ;  element 0      = number of significant bytes (1 to 4), or 0 if x cannot be encoded
  ;  elements 1..n  = the encoded bytes, lead byte first
  ;The array is enlarged when it is too small to hold a 4 byte encoding.
  ;On error only element 0 is written; elements 1 to 4 keep whatever they held before.
  If ArraySize(encoded_codepoint()) < #UTF8_codePointMaxByteCount
    ReDim encoded_codepoint.a(#UTF8_codePointMaxByteCount)
  EndIf

  Select x
    Case 0 To $7F
      ;0xxxxxxx
      encoded_codepoint(0) = 1
      encoded_codepoint(1) = x
    Case $80 To $7FF
      ;110xxxxx 10xxxxxx
      encoded_codepoint(0) = 2
      encoded_codepoint(1) = (x >> 6) | %11000000                ;upper 5 payload bits
      encoded_codepoint(2) = (x & %00111111) | %10000000         ;lowest 6 bits
    Case $800 To $D7FF, $E000 To $FFFF
      ;1110xxxx 10xxxxxx 10xxxxxx
      ;$D800 to $DFFF is deliberately left out: those values are UTF-16 surrogates,
      ;not code points, and must not be encoded as UTF-8. They fall through to Default.
      encoded_codepoint(0) = 3
      encoded_codepoint(1) = (x >> 12) | %11100000               ;upper 4 payload bits
      encoded_codepoint(2) = ((x >> 6) & %00111111) | %10000000  ;middle 6 bits
      encoded_codepoint(3) = (x & %00111111) | %10000000         ;lowest 6 bits
    Case $10000 To $10FFFF
      ;11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      encoded_codepoint(0) = 4
      encoded_codepoint(1) = (x >> 18) | %11110000               ;upper 3 payload bits
      encoded_codepoint(2) = ((x >> 12) & %00111111) | %10000000 ;next 6 bits
      encoded_codepoint(3) = ((x >> 6) & %00111111) | %10000000  ;next 6 bits
      encoded_codepoint(4) = (x & %00111111) | %10000000         ;lowest 6 bits
    Default
      ;negative values, surrogates and values above $10FFFF
      encoded_codepoint(0) = 0  ;error, codepoint is not valid and can't be encoded
  EndSelect
EndProcedure

Procedure UTF8_decode(Array encoded_codepoint.a(1))
  ;Decodes one UTF-8 sequence and returns its code point, or -1 on error.
  ;
  ;Input: elements 1 to 4 of encoded_codepoint() hold the bytes, lead byte first.
  ;Element 0 is not used; the sequence length is taken from the lead byte.
  ;
  ;-1 is returned when the array is too small, when the lead byte is not a valid
  ;UTF-8 lead byte, or when a following byte is not a continuation byte (10xxxxxx).
  ;Overlong encodings and out-of-range results are not rejected.
  Protected x, i, byteCount

  If ArraySize(encoded_codepoint()) < #UTF8_codePointMaxByteCount
    ProcedureReturn -1 ;Input array was not dimensioned properly.
  EndIf

  ;The lead byte gives the sequence length and the highest payload bits.
  Select encoded_codepoint(1)
    Case %00000000 To %01111111 ;1 byte encoding
      ProcedureReturn encoded_codepoint(1)
    Case %11000000 To %11011111 ;2 byte encoding, lead byte carries 5 bits
      byteCount = 2
      x = encoded_codepoint(1) & %00011111
    Case %11100000 To %11101111 ;3 byte encoding, lead byte carries 4 bits
      byteCount = 3
      x = encoded_codepoint(1) & %00001111
    Case %11110000 To %11110111 ;4 byte encoding, lead byte carries 3 bits
      byteCount = 4
      x = encoded_codepoint(1) & %00000111
    Default ;a stray continuation byte or an invalid lead byte
      ProcedureReturn -1
  EndSelect

  ;Append 6 payload bits per continuation byte. Explicit assignment is used so the
  ;result does not depend on operator precedence, and the lead-byte bits are
  ;shifted exactly once per continuation byte.
  For i = 2 To byteCount
    If (encoded_codepoint(i) & %11000000) <> %10000000
      ProcedureReturn -1
    EndIf
    x = (x << 6) | (encoded_codepoint(i) & %00111111)
  Next

  ProcedureReturn x
EndProcedure

;helper procedure to format output for this example
Procedure.s formatOutput(c$, c, Array encoded_utf.a(1), dcp) ;character, codepooint, UTf8 encoding, decoded codepoint
  ;Builds one row of the result table:
  ;  character, "U+" code point, encoded bytes in hex, decoded code point in hex.
  ;encoded_utf(0) is read as the number of encoded bytes held in elements 1 onward.
  Protected byteIndex, hexBytes$, codePointField$, decodedField$

  ;Two hex digits per encoded byte, each followed by a space.
  byteIndex = 1
  While byteIndex <= encoded_utf(0)
    hexBytes$ = hexBytes$ + RSet(Hex(encoded_utf(byteIndex)), 2, "0") + " "
    byteIndex + 1
  Wend

  codePointField$ = LSet("U+" + RSet(Hex(c), 5, "0"), 10)
  decodedField$ = RSet(Hex(dcp), 5, "0")

  ProcedureReturn "   " + LSet(c$, 8) + codePointField$ + "  " + LSet(hexBytes$, 11, " ") + "   " + decodedField$
EndProcedure

DataSection
  ;Sample Unicode code points in hex.
  ;The first item is the number of code points that follow it.
  unicode_codepoints:
  Data.i 5, $41, $F6, $416, $20AC, $1D11E
  ;The names for these code points are, in order: Latin capital letter A; Latin small letter o with diaeresis;
  ;Cyrillic capital letter zhe; euro sign; musical symbol G clef.
EndDataSection

;Load the sample code points; the first data item says how many values follow.
Restore unicode_codepoints
Read num_codepoints
num_codepoints = num_codepoints - 1 ;from here on this is the highest array index, not the count

Dim codepoint(num_codepoints)
i = 0
While i <= num_codepoints
  Read codepoint(i)
  i + 1
Wend

;Work buffer shared by UTF8_encode() and UTF8_decode().
;After encoding, element 0 is the byte count and the following elements are the bytes.
;For decoding, element 0 is ignored and elements 1 To 4 are the bytes to decode.
Dim encoded_codepoint.a(#UTF8_codePointMaxByteCount)

If OpenConsole("", #PB_UTF8)
  ;Three header rows made of four left-aligned columns.
  PrintN(LSet("", 11) + LSet("Unicode", 12) + LSet("UTF-8",14) + LSet("Decoded",12))
  PrintN(LSet("Character", 11) + LSet("Code Point", 12) + LSet("Encoding",14) + LSet("Code Point",12))
  PrintN(LSet("---------", 11) + LSet("----------", 12) + LSet("-----------",14) + LSet("-----------",12))

  ;One table row per sample: encode, decode the result again, print both.
  For i = 0 To num_codepoints
    UTF8_encode(codepoint(i), encoded_codepoint())
    dcp = UTF8_decode(encoded_codepoint()) ;a correct round trip gives back codepoint(i)
    ;NOTE(review): Chr() may not produce the intended character for code points above $FFFF - confirm.
    row$ = formatOutput(Chr(codepoint(i)), codepoint(i), encoded_codepoint(), dcp)
    PrintN(row$)
  Next

  Print(#CRLF$ + #CRLF$ + "Press ENTER to exit")
  Input()
  CloseConsole()
EndIf