1779 lines
56 KiB
Plaintext
1779 lines
56 KiB
Plaintext
(*
|
||
|
||
UTF-8 encoding and decoding in ATS2.
|
||
|
||
This is adapted from library code I wrote long ago and actually
|
||
handles the original 1-to-6-byte encoding, as well. Valid Unicode
|
||
requires only 1 to 4 bytes.
|
||
|
||
What is remarkable about the following rather lengthy code is its
|
||
use of proofs of what is and what is not valid UTF-8. Much of what
|
||
follows is proofs rather than executable code. It seemed simpler to
|
||
"mostly copy" my old code, than to try to pare that code down to a
|
||
minimum that still demonstrated some of what makes ATS different
|
||
from all but a few (if any) other languages.
|
||
|
||
*)
|
||
|
||
#define ATS_EXTERN_PREFIX "utf8_encoding_"
|
||
|
||
#include "share/atspre_define.hats"
|
||
#include "share/atspre_staload.hats"
|
||
|
||
(* A variant of ‘andalso’, with dependent types. *)
|
||
fn {}
|
||
andalso1 {b1, b2 : bool} (b1 : bool b1, b2 : bool b2) :<>
|
||
[b3 : bool | b3 == (b1 && b2)] bool b3 =
|
||
if b1 then
|
||
b2
|
||
else
|
||
false
|
||
|
||
infixl (&&) &&&
|
||
macdef &&& = andalso1
|
||
|
||
(*###################### C CODE ####################################*)
|
||
|
||
%{^
|
||
|
||
_Static_assert (4 <= sizeof (int),
|
||
"sizeof(int) must equal at least 4");
|
||
|
||
#define utf8_encoding_2_entries(v) \
|
||
(v), (v)
|
||
|
||
#define utf8_encoding_4_entries(v) \
|
||
utf8_encoding_2_entries (v), \
|
||
utf8_encoding_2_entries (v)
|
||
|
||
#define utf8_encoding_8_entries(v) \
|
||
utf8_encoding_4_entries (v), \
|
||
utf8_encoding_4_entries (v)
|
||
|
||
#define utf8_encoding_16_entries(v) \
|
||
utf8_encoding_8_entries (v), \
|
||
utf8_encoding_8_entries (v)
|
||
|
||
#define utf8_encoding_32_entries(v) \
|
||
utf8_encoding_16_entries (v), \
|
||
utf8_encoding_16_entries (v)
|
||
|
||
#define utf8_encoding_64_entries(v) \
|
||
utf8_encoding_32_entries (v), \
|
||
utf8_encoding_32_entries (v)
|
||
|
||
#define utf8_encoding_128_entries(v) \
|
||
utf8_encoding_64_entries (v), \
|
||
utf8_encoding_64_entries (v)
|
||
|
||
static const atstype_int8 utf8_encoding_extended_utf8_lengths__[256] = {
|
||
utf8_encoding_128_entries (1),
|
||
utf8_encoding_64_entries (-1),
|
||
utf8_encoding_32_entries (2),
|
||
utf8_encoding_16_entries (3),
|
||
utf8_encoding_8_entries (4),
|
||
utf8_encoding_4_entries (5),
|
||
utf8_encoding_2_entries (6),
|
||
utf8_encoding_2_entries (-1)
|
||
};
|
||
|
||
static const atstype_int8 utf8_encoding_utf8_lengths__[256] = {
|
||
utf8_encoding_128_entries (1),
|
||
utf8_encoding_64_entries (-1),
|
||
utf8_encoding_32_entries (2),
|
||
utf8_encoding_16_entries (3),
|
||
utf8_encoding_8_entries (4),
|
||
utf8_encoding_4_entries (-1),
|
||
utf8_encoding_2_entries (-1),
|
||
utf8_encoding_2_entries (-1)
|
||
};
|
||
|
||
#define utf8_encoding_extended_utf8_character_length(c) \
|
||
(utf8_encoding_extended_utf8_lengths__[(atstype_uint8) (c)])
|
||
|
||
#define utf8_encoding_utf8_character_length(c) \
|
||
(utf8_encoding_utf8_lengths__[(atstype_uint8) (c)])
|
||
|
||
%}
|
||
|
||
(*###################### INTERFACE #################################*)
|
||
|
||
stadef
|
||
is_valid_unicode_code_point (u : int) : bool =
|
||
(0x0 <= u && u < 0xD800) || (0xE000 <= u && u <= 0x10FFFF)
|
||
|
||
extern fun {}
|
||
is_valid_unicode_code_point :
|
||
{u : int} int u -<>
|
||
[b : bool | b == is_valid_unicode_code_point u]
|
||
bool b
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
stadef
|
||
is_extended_utf8_1byte_first_byte (c0 : int) : bool =
|
||
0x00 <= c0 && c0 <= 0x7F
|
||
|
||
stadef
|
||
is_extended_utf8_2byte_first_byte (c0 : int) : bool =
|
||
0xC0 <= c0 && c0 <= 0xDF
|
||
|
||
stadef
|
||
is_extended_utf8_3byte_first_byte (c0 : int) : bool =
|
||
0xE0 <= c0 && c0 <= 0xEF
|
||
|
||
stadef
|
||
is_extended_utf8_4byte_first_byte (c0 : int) : bool =
|
||
0xF0 <= c0 && c0 <= 0xF7
|
||
|
||
stadef
|
||
is_extended_utf8_5byte_first_byte (c0 : int) : bool =
|
||
0xF8 <= c0 && c0 <= 0xFB
|
||
|
||
stadef
|
||
is_extended_utf8_6byte_first_byte (c0 : int) : bool =
|
||
0xFC <= c0 && c0 <= 0xFD
|
||
|
||
stadef
|
||
extended_utf8_character_length (c0 : int) : int =
|
||
ifint (is_extended_utf8_1byte_first_byte c0, 1,
|
||
ifint (is_extended_utf8_2byte_first_byte c0, 2,
|
||
ifint (is_extended_utf8_3byte_first_byte c0, 3,
|
||
ifint (is_extended_utf8_4byte_first_byte c0, 4,
|
||
ifint (is_extended_utf8_5byte_first_byte c0, 5,
|
||
ifint (is_extended_utf8_6byte_first_byte c0, 6, ~1))))))
|
||
|
||
stadef
|
||
extended_utf8_char_length_relation (c0 : int, n : int) : bool =
|
||
(n == 1 && is_extended_utf8_1byte_first_byte c0) ||
|
||
(n == 2 && is_extended_utf8_2byte_first_byte c0) ||
|
||
(n == 3 && is_extended_utf8_3byte_first_byte c0) ||
|
||
(n == 4 && is_extended_utf8_4byte_first_byte c0) ||
|
||
(n == 5 && is_extended_utf8_5byte_first_byte c0) ||
|
||
(n == 6 && is_extended_utf8_6byte_first_byte c0)
|
||
|
||
extern prfun
|
||
extended_utf8_char_length_relation_to_length :
|
||
{c0 : int}
|
||
{n : int | extended_utf8_char_length_relation (c0, n) ||
|
||
(n == ~1 && ((c0 < 0x00) ||
|
||
(0x7F < c0 && c0 < 0xC0) ||
|
||
(0xFD < c0)))}
|
||
() -<prf> [n == extended_utf8_character_length c0] void
|
||
|
||
extern prfun
|
||
extended_utf8_char_length_to_length_relation :
|
||
{c0 : int}
|
||
{n : int | n == extended_utf8_character_length c0}
|
||
() -<prf>
|
||
[extended_utf8_char_length_relation (c0, n) ||
|
||
(n == ~1 && ((c0 < 0x00) || (0x7F < c0 && c0 < 0xC0) || (0xFD < c0)))]
|
||
void
|
||
|
||
// extended_utf8_character_length:
|
||
//
|
||
// Return value = 1, 2, 3, or 4 indicates that there may be a valid
|
||
// UTF-8 character, of the given length. It may also be a `valid'
|
||
// overlong sequence. Otherwise there definitely is not a valid
|
||
// character of any sort starting with the given byte. Return
|
||
// value = 5 or 6 indicates the possible start of an `extended
|
||
// UTF-8' character of the given length, including code points up
|
||
// to 0xffffffff. Return value = ~1 means the byte is not the
|
||
// start of a valid sequence.
|
||
extern fun
|
||
extended_utf8_character_length :
|
||
{c0 : int | 0x00 <= c0; c0 <= 0xFF} int c0 -<>
|
||
[n : int | n == extended_utf8_character_length c0] int n = "mac#%"
|
||
|
||
stadef
|
||
utf8_character_length (c0 : int) : int =
|
||
ifint (is_extended_utf8_1byte_first_byte c0, 1,
|
||
ifint (is_extended_utf8_2byte_first_byte c0, 2,
|
||
ifint (is_extended_utf8_3byte_first_byte c0, 3,
|
||
ifint (is_extended_utf8_4byte_first_byte c0, 4, ~1))))
|
||
|
||
stadef
|
||
utf8_char_length_relation (c0 : int, n : int) : bool =
|
||
(n == 1 && is_extended_utf8_1byte_first_byte c0) ||
|
||
(n == 2 && is_extended_utf8_2byte_first_byte c0) ||
|
||
(n == 3 && is_extended_utf8_3byte_first_byte c0) ||
|
||
(n == 4 && is_extended_utf8_4byte_first_byte c0)
|
||
|
||
extern prfun
|
||
utf8_char_length_relation_to_length :
|
||
{c0 : int}
|
||
{n : int | utf8_char_length_relation (c0, n) ||
|
||
(n == ~1 && ((c0 < 0x00) ||
|
||
(0x7F < c0 && c0 < 0xC0) ||
|
||
(0xF7 < c0)))}
|
||
() -<prf> [n == utf8_character_length c0] void
|
||
|
||
extern prfun
|
||
utf8_char_length_to_length_relation :
|
||
{c0 : int}
|
||
{n : int | n == utf8_character_length c0}
|
||
() -<prf>
|
||
[utf8_char_length_relation (c0, n) ||
|
||
(n == ~1 && ((c0 < 0x00) || (0x7F < c0 && c0 < 0xC0) || (0xF7 < c0)))]
|
||
void
|
||
|
||
extern fun
|
||
utf8_character_length :
|
||
{c0 : int | 0x00 <= c0; c0 <= 0xFF} int c0 -<>
|
||
[n : int | n == utf8_character_length c0] int n = "mac#%"
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
stadef
|
||
is_valid_utf8_continuation_byte (c : int) : bool =
|
||
0x80 <= c && c <= 0xBF
|
||
|
||
stadef
|
||
is_invalid_utf8_continuation_byte (c : int) : bool =
|
||
c < 0x80 || 0xBF < c
|
||
|
||
extern fun {}
|
||
is_valid_utf8_continuation_byte :
|
||
{c : int} int c -<>
|
||
[b : bool | b == is_valid_utf8_continuation_byte c] bool b
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
stadef
|
||
extended_utf8_char_1byte_decoding (c0 : int) : int =
|
||
c0
|
||
|
||
stadef
|
||
extended_utf8_char_2byte_decoding (c0 : int, c1 : int) : int =
|
||
64 * (c0 - 0xC0) + (c1 - 0x80)
|
||
|
||
stadef
|
||
extended_utf8_char_3byte_decoding (c0 : int, c1 : int, c2 : int) : int =
|
||
64 * 64 * (c0 - 0xE0) + 64 * (c1 - 0x80) + (c2 - 0x80)
|
||
|
||
stadef
|
||
extended_utf8_char_4byte_decoding (c0 : int, c1 : int, c2 : int,
|
||
c3 : int) : int =
|
||
64 * 64 * 64 * (c0 - 0xF0) + 64 * 64 * (c1 - 0x80) +
|
||
64 * (c2 - 0x80) + (c3 - 0x80)
|
||
|
||
stadef
|
||
extended_utf8_char_5byte_decoding (c0 : int, c1 : int, c2 : int,
|
||
c3 : int, c4 : int) : int =
|
||
64 * 64 * 64 * 64 * (c0 - 0xF8) + 64 * 64 * 64 * (c1 - 0x80) +
|
||
64 * 64 * (c2 - 0x80) + 64 * (c3 - 0x80) + (c4 - 0x80)
|
||
|
||
stadef
|
||
extended_utf8_char_6byte_decoding (c0 : int, c1 : int, c2 : int,
|
||
c3 : int, c4 : int, c5 : int) : int =
|
||
64 * 64 * 64 * 64 * 64 * (c0 - 0xFC) + 64 * 64 * 64 * 64 * (c1 - 0x80) +
|
||
64 * 64 * 64 * (c2 - 0x80) + 64 * 64 * (c3 - 0x80) +
|
||
64 * (c4 - 0x80) + (c5 - 0x80)
|
||
|
||
dataprop EXTENDED_UTF8_CHAR (length : int, u : int, c0 : int, c1 : int,
|
||
c2 : int, c3 : int, c4 : int, c5 : int) =
|
||
| {u, c0 : int |
|
||
0 <= u; u <= 0x7F;
|
||
is_extended_utf8_1byte_first_byte c0;
|
||
u == extended_utf8_char_1byte_decoding (c0)}
|
||
EXTENDED_UTF8_CHAR_1byte (1, u, c0, ~1, ~1, ~1, ~1, ~1)
|
||
//
|
||
| {u, c0, c1 : int |
|
||
0 <= u; u <= 0x7FF;
|
||
is_extended_utf8_2byte_first_byte c0;
|
||
is_valid_utf8_continuation_byte c1;
|
||
u == extended_utf8_char_2byte_decoding (c0, c1)}
|
||
EXTENDED_UTF8_CHAR_2byte (2, u, c0, c1, ~1, ~1, ~1, ~1)
|
||
//
|
||
| {u, c0, c1, c2 : int |
|
||
0 <= u; u <= 0xFFFF;
|
||
is_extended_utf8_3byte_first_byte c0;
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
u == extended_utf8_char_3byte_decoding (c0, c1, c2)}
|
||
EXTENDED_UTF8_CHAR_3byte (3, u, c0, c1, c2, ~1, ~1, ~1)
|
||
//
|
||
| {u, c0, c1, c2, c3 : int |
|
||
0 <= u; u <= 0x1FFFFF;
|
||
is_extended_utf8_4byte_first_byte c0;
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
is_valid_utf8_continuation_byte c3;
|
||
u == extended_utf8_char_4byte_decoding (c0, c1, c2, c3)}
|
||
EXTENDED_UTF8_CHAR_4byte (4, u, c0, c1, c2, c3, ~1, ~1)
|
||
//
|
||
| {u, c0, c1, c2, c3, c4 : int |
|
||
0 <= u; u <= 0x3FFFFFF;
|
||
is_extended_utf8_5byte_first_byte c0;
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
is_valid_utf8_continuation_byte c3;
|
||
is_valid_utf8_continuation_byte c4;
|
||
u == extended_utf8_char_5byte_decoding (c0, c1, c2, c3, c4)}
|
||
EXTENDED_UTF8_CHAR_5byte (5, u, c0, c1, c2, c3, c4, ~1)
|
||
//
|
||
| {u, c0, c1, c2, c3, c4, c5 : int |
|
||
0 <= u; u <= 0x7FFFFFFF;
|
||
is_extended_utf8_6byte_first_byte c0;
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
is_valid_utf8_continuation_byte c3;
|
||
is_valid_utf8_continuation_byte c4;
|
||
is_valid_utf8_continuation_byte c5;
|
||
u == extended_utf8_char_6byte_decoding (c0, c1, c2, c3, c4, c5)}
|
||
EXTENDED_UTF8_CHAR_6byte (6, u, c0, c1, c2, c3, c4, c5)
|
||
|
||
extern prfun
|
||
decode_extended_utf8_istot :
|
||
{n : int | 1 <= n; n <= 6}
|
||
{c0, c1, c2, c3, c4, c5 : int |
|
||
extended_utf8_char_length_relation (c0, n);
|
||
n <= 1 || is_valid_utf8_continuation_byte c1;
|
||
n <= 2 || is_valid_utf8_continuation_byte c2;
|
||
n <= 3 || is_valid_utf8_continuation_byte c3;
|
||
n <= 4 || is_valid_utf8_continuation_byte c4;
|
||
n <= 5 || is_valid_utf8_continuation_byte c5;
|
||
1 < n || c1 == ~1;
|
||
2 < n || c2 == ~1;
|
||
3 < n || c3 == ~1;
|
||
4 < n || c4 == ~1;
|
||
5 < n || c5 == ~1}
|
||
() -<prf> [u : int] EXTENDED_UTF8_CHAR (n, u, c0, c1, c2, c3, c4, c5)
|
||
|
||
extern prfun
|
||
decode_extended_utf8_isfun :
|
||
{na : int | 1 <= na; na <= 6}
|
||
{ua : int}
|
||
{c0a, c1a, c2a, c3a, c4a, c5a : int}
|
||
{nb : int | nb == na}
|
||
{ub : int}
|
||
{c0b, c1b, c2b, c3b, c4b, c5b : int |
|
||
c0b == c0a;
|
||
c1b == c1a;
|
||
c2b == c2a;
|
||
c3b == c3a;
|
||
c4b == c4a;
|
||
c5b == c5a}
|
||
(EXTENDED_UTF8_CHAR (na, ua, c0a, c1a, c2a, c3a, c4a, c5a),
|
||
EXTENDED_UTF8_CHAR (nb, ub, c0b, c1b, c2b, c3b, c4b, c5b)) -<prf>
|
||
[ua == ub] void
|
||
|
||
extern prfun
|
||
lemma_extended_utf8_char_length :
|
||
{n : int} {u : int} {c0, c1, c2, c3, c4, c5 : int}
|
||
EXTENDED_UTF8_CHAR (n, u, c0, c1, c2, c3, c4, c5) -<prf>
|
||
[n == extended_utf8_character_length c0] void
|
||
|
||
extern fun {}
|
||
decode_extended_utf8_1byte :
|
||
{c0 : int | 0x00 <= c0; c0 <= 0x7F} int c0 -<>
|
||
[u : int] (EXTENDED_UTF8_CHAR (1, u, c0, ~1, ~1, ~1, ~1, ~1) | int u)
|
||
|
||
extern fun {}
|
||
decode_extended_utf8_2byte :
|
||
{c0, c1 : int | 0xC0 <= c0; c0 <= 0xDF;
|
||
is_valid_utf8_continuation_byte c1}
|
||
(int c0, int c1) -<>
|
||
[u : int] (EXTENDED_UTF8_CHAR (2, u, c0, c1, ~1, ~1, ~1, ~1) | int u)
|
||
|
||
extern fun {}
|
||
decode_extended_utf8_3byte :
|
||
{c0, c1, c2 : int | 0xE0 <= c0; c0 <= 0xEF;
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2}
|
||
(int c0, int c1, int c2) -<>
|
||
[u : int] (EXTENDED_UTF8_CHAR (3, u, c0, c1, c2, ~1, ~1, ~1) | int u)
|
||
|
||
extern fun {}
|
||
decode_extended_utf8_4byte :
|
||
{c0, c1, c2, c3 : int | 0xF0 <= c0; c0 <= 0xF7;
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
is_valid_utf8_continuation_byte c3}
|
||
(int c0, int c1, int c2, int c3) -<>
|
||
[u : int] (EXTENDED_UTF8_CHAR (4, u, c0, c1, c2, c3, ~1, ~1) | int u)
|
||
|
||
extern fun {}
|
||
decode_extended_utf8_5byte :
|
||
{c0, c1, c2, c3, c4 : int | 0xF8 <= c0; c0 <= 0xFB;
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
is_valid_utf8_continuation_byte c3;
|
||
is_valid_utf8_continuation_byte c4}
|
||
(int c0, int c1, int c2, int c3, int c4) -<>
|
||
[u : int] (EXTENDED_UTF8_CHAR (5, u, c0, c1, c2, c3, c4, ~1) | int u)
|
||
|
||
extern fun {}
|
||
decode_extended_utf8_6byte :
|
||
{c0, c1, c2, c3, c4, c5 : int | 0xFC <= c0; c0 <= 0xFD;
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
is_valid_utf8_continuation_byte c3;
|
||
is_valid_utf8_continuation_byte c4;
|
||
is_valid_utf8_continuation_byte c5}
|
||
(int c0, int c1, int c2, int c3, int c4, int c5) -<>
|
||
[u : int]
|
||
(EXTENDED_UTF8_CHAR (6, u, c0, c1, c2, c3, c4, c5) | int u)
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
stadef extended_utf8_shortest_length (u : int) =
|
||
ifint (u < 0, ~1,
|
||
ifint (u <= 0x7F, 1,
|
||
ifint (u <= 0x7FF, 2,
|
||
ifint (u <= 0xFFFF, 3,
|
||
ifint (u <= 0x1FFFFF, 4,
|
||
ifint (u <= 0x3FFFFFF, 5,
|
||
ifint (u <= 0x7FFFFFFF, 6, ~1)))))))
|
||
|
||
dataprop EXTENDED_UTF8_SHORTEST (length : int, u : int, c0 : int, c1 : int,
|
||
c2 : int, c3 : int, c4 : int, c5 : int) =
|
||
| {u, c0 : int |
|
||
0 <= u; u <= 0x7F;
|
||
c0 == u}
|
||
EXTENDED_UTF8_SHORTEST_1byte (1, u, c0, ~1, ~1, ~1, ~1, ~1) of
|
||
EXTENDED_UTF8_CHAR (1, u, c0, ~1, ~1, ~1, ~1, ~1)
|
||
//
|
||
| {u, c0, c1 : int |
|
||
0x7F < u; u <= 0x7FF;
|
||
c0 == 0xC0 + (u \ndiv 64);
|
||
c1 == 0x80 + (u \nmod 64)}
|
||
EXTENDED_UTF8_SHORTEST_2byte (2, u, c0, c1, ~1, ~1, ~1, ~1) of
|
||
EXTENDED_UTF8_CHAR (2, u, c0, c1, ~1, ~1, ~1, ~1)
|
||
//
|
||
| {u, c0, c1, c2 : int |
|
||
0x7FF < u; u <= 0xFFFF;
|
||
c0 == 0xE0 + (u \ndiv (64 * 64));
|
||
c1 == 0x80 + ((u \ndiv 64) \nmod 64);
|
||
c2 == 0x80 + (u \nmod 64)}
|
||
EXTENDED_UTF8_SHORTEST_3byte (3, u, c0, c1, c2, ~1, ~1, ~1) of
|
||
EXTENDED_UTF8_CHAR (3, u, c0, c1, c2, ~1, ~1, ~1)
|
||
//
|
||
| {u, c0, c1, c2, c3 : int |
|
||
0xFFFF < u; u <= 0x1FFFFF;
|
||
c0 == 0xF0 + (u \ndiv (64 * 64 * 64));
|
||
c1 == 0x80 + ((u \ndiv (64 * 64)) \nmod 64);
|
||
c2 == 0x80 + ((u \ndiv 64) \nmod 64);
|
||
c3 == 0x80 + (u \nmod 64)}
|
||
EXTENDED_UTF8_SHORTEST_4byte (4, u, c0, c1, c2, c3, ~1, ~1) of
|
||
EXTENDED_UTF8_CHAR (4, u, c0, c1, c2, c3, ~1, ~1)
|
||
//
|
||
| {u, c0, c1, c2, c3, c4 : int |
|
||
0x1FFFFF < u; u <= 0x3FFFFFF;
|
||
c0 == 0xF8 + (u \ndiv (64 * 64 * 64 * 64));
|
||
c1 == 0x80 + ((u \ndiv (64 * 64 * 64)) \nmod 64);
|
||
c2 == 0x80 + ((u \ndiv (64 * 64)) \nmod 64);
|
||
c3 == 0x80 + ((u \ndiv 64) \nmod 64);
|
||
c4 == 0x80 + (u \nmod 64)}
|
||
EXTENDED_UTF8_SHORTEST_5byte (5, u, c0, c1, c2, c3, c4, ~1) of
|
||
EXTENDED_UTF8_CHAR (5, u, c0, c1, c2, c3, c4, ~1)
|
||
//
|
||
| {u, c0, c1, c2, c3, c4, c5 : int |
|
||
0x3FFFFFF < u; u <= 0x7FFFFFFF;
|
||
c0 == 0xFC + (u \ndiv (64 * 64 * 64 * 64 * 64));
|
||
c1 == 0x80 + ((u \ndiv (64 * 64 * 64 * 64)) \nmod 64);
|
||
c2 == 0x80 + ((u \ndiv (64 * 64 * 64)) \nmod 64);
|
||
c3 == 0x80 + ((u \ndiv (64 * 64)) \nmod 64);
|
||
c4 == 0x80 + ((u \ndiv 64) \nmod 64);
|
||
c5 == 0x80 + (u \nmod 64)}
|
||
EXTENDED_UTF8_SHORTEST_6byte (6, u, c0, c1, c2, c3, c4, c5) of
|
||
EXTENDED_UTF8_CHAR (6, u, c0, c1, c2, c3, c4, c5)
|
||
|
||
extern prfun
|
||
extended_utf8_shortest_is_char :
|
||
{n : int} {u : int} {c0, c1, c2, c3, c4, c5 : int}
|
||
EXTENDED_UTF8_SHORTEST (n, u, c0, c1, c2, c3, c4, c5) -<prf>
|
||
EXTENDED_UTF8_CHAR (n, u, c0, c1, c2, c3, c4, c5)
|
||
|
||
extern prfun
|
||
lemma_extended_utf8_shortest_length :
|
||
{n : int} {u : int} {c0, c1, c2, c3, c4, c5 : int}
|
||
EXTENDED_UTF8_SHORTEST (n, u, c0, c1, c2, c3, c4, c5) -<prf>
|
||
[n == extended_utf8_shortest_length u] void
|
||
|
||
extern fun {}
|
||
encode_extended_utf8_character :
|
||
{u : nat | u <= 0x7FFFFFFF}
|
||
int u -<>
|
||
[n : int] [c0, c1, c2, c3, c4, c5 : int]
|
||
@(EXTENDED_UTF8_SHORTEST (n, u, c0, c1, c2, c3, c4, c5) |
|
||
int n, int c0, int c1, int c2, int c3, int c4, int c5)
|
||
|
||
(*------------------------------------------------------------------*)
|
||
//
|
||
// A valid UTF-8 character is one that encodes a valid Unicode
|
||
// code point and is not overlong.
|
||
//
|
||
|
||
dataprop UTF8_CHAR_INVALID_CASES (c0 : int, c1 : int, c2 : int, c3 : int) =
|
||
// The cases are not mutually exclusive.
|
||
| {c0, c1, c2, c3 : int | extended_utf8_character_length c0 == ~1}
|
||
// We might not be using this case, presently (has that changed?),
|
||
// but it is included for completeness.
|
||
UTF8_CHAR_INVALID_bad_c0 (c0, c1, c2, c3)
|
||
| {c0, c1, c2, c3 : int | 2 <= extended_utf8_character_length c0;
|
||
~(is_valid_utf8_continuation_byte c1)}
|
||
UTF8_CHAR_INVALID_bad_c1 (c0, c1, c2, c3)
|
||
| {c0, c1, c2, c3 : int | 3 <= extended_utf8_character_length c0;
|
||
~(is_valid_utf8_continuation_byte c2)}
|
||
UTF8_CHAR_INVALID_bad_c2 (c0, c1, c2, c3)
|
||
| {c0, c1, c2, c3 : int | extended_utf8_character_length c0 == 4;
|
||
~(is_valid_utf8_continuation_byte c3)}
|
||
UTF8_CHAR_INVALID_bad_c3 (c0, c1, c2, c3)
|
||
| {c0, c1, c2, c3 : int |
|
||
extended_utf8_character_length c0 == 2;
|
||
extended_utf8_char_2byte_decoding (c0, c1) <= 0x7F}
|
||
UTF8_CHAR_INVALID_invalid_2byte (c0, c1, c2, c3)
|
||
| {c0, c1, c2, c3 : int |
|
||
extended_utf8_character_length c0 == 3
|
||
&& (extended_utf8_char_3byte_decoding (c0, c1, c2) <= 0x7FF
|
||
|| ~(is_valid_unicode_code_point
|
||
(extended_utf8_char_3byte_decoding (c0, c1, c2))))}
|
||
UTF8_CHAR_INVALID_invalid_3byte (c0, c1, c2, c3)
|
||
| {c0, c1, c2, c3 : int |
|
||
extended_utf8_character_length c0 == 4
|
||
&& (extended_utf8_char_4byte_decoding (c0, c1, c2, c3) <= 0xFFFF
|
||
|| ~(is_valid_unicode_code_point
|
||
(extended_utf8_char_4byte_decoding (c0, c1, c2, c3))))}
|
||
UTF8_CHAR_INVALID_invalid_4byte (c0, c1, c2, c3)
|
||
|
||
dataprop UTF8_CHAR_VALID_BYTES (c0 : int, c1 : int, c2 : int, c3 : int) =
|
||
| {c0 : int | 0 <= c0 && c0 <= 0x7F}
|
||
UTF8_CHAR_VALID_BYTES_1byte (c0, ~1, ~1, ~1)
|
||
| {c0, c1 : int | 0xC2 <= c0 && c0 <= 0xDF;
|
||
is_valid_utf8_continuation_byte c1}
|
||
UTF8_CHAR_VALID_BYTES_2byte (c0, c1, ~1, ~1)
|
||
| {c0, c1, c2 : int | (0xE1 <= c0 && c0 <= 0xEC)
|
||
|| c0 == 0xEE
|
||
|| c0 == 0xEF
|
||
|| (c0 == 0xE0 && 0xA0 <= c1)
|
||
|| (c0 == 0xED && c1 < 0xA0);
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2}
|
||
UTF8_CHAR_VALID_BYTES_3byte (c0, c1, c2, ~1)
|
||
| {c0, c1, c2, c3 : int | (0xF1 <= c0 && c0 <= 0xF3)
|
||
|| (c0 == 0xF0 && 0x90 <= c1)
|
||
|| (c0 == 0xF4 && c1 < 0x90);
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
is_valid_utf8_continuation_byte c3}
|
||
UTF8_CHAR_VALID_BYTES_4byte (c0, c1, c2, c3)
|
||
|
||
dataprop UTF8_CHAR_INVALID_BYTES (c0 : int, c1 : int, c2 : int, c3 : int) =
|
||
| // This should never occur.
|
||
{c0 : int | is_extended_utf8_1byte_first_byte c0}
|
||
UTF8_CHAR_INVALID_BYTES_1byte (c0, ~1, ~1, ~1)
|
||
| {c0, c1 : int | is_extended_utf8_2byte_first_byte c0;
|
||
c0 == 0xC0 || c0 == 0xC1 ||
|
||
is_invalid_utf8_continuation_byte c1}
|
||
UTF8_CHAR_INVALID_BYTES_2byte (c0, c1, ~1, ~1)
|
||
| {c0, c1, c2 : int | is_extended_utf8_3byte_first_byte c0;
|
||
(c0 == 0xE0 && c1 < 0xA0) ||
|
||
(c0 == 0xED && 0xA0 <= c1) ||
|
||
is_invalid_utf8_continuation_byte c1 ||
|
||
is_invalid_utf8_continuation_byte c2}
|
||
UTF8_CHAR_INVALID_BYTES_3byte (c0, c1, c2, ~1)
|
||
| {c0, c1, c2, c3 : int | is_extended_utf8_4byte_first_byte c0;
|
||
0xF4 < c0 ||
|
||
(c0 == 0xF0 && c1 < 0x90) ||
|
||
(c0 == 0xF4 && 0x90 <= c1) ||
|
||
is_invalid_utf8_continuation_byte c1 ||
|
||
is_invalid_utf8_continuation_byte c2 ||
|
||
is_invalid_utf8_continuation_byte c3}
|
||
UTF8_CHAR_INVALID_BYTES_4byte (c0, c1, c2, c3)
|
||
| {c0, c1, c2, c3 : int | ~(is_extended_utf8_1byte_first_byte c0);
|
||
~(is_extended_utf8_2byte_first_byte c0);
|
||
~(is_extended_utf8_3byte_first_byte c0);
|
||
~(is_extended_utf8_4byte_first_byte c0)}
|
||
UTF8_CHAR_INVALID_BYTES_bad_c0 (c0, c1, c2, c3)
|
||
|
||
dataprop UTF8_CHAR_VALIDITY (n : int, u : int, c0 : int, c1 : int,
|
||
c2 : int, c3 : int, b : bool) =
|
||
| {n : int} {u : int | is_valid_unicode_code_point u}
|
||
{c0, c1, c2, c3 : int}
|
||
UTF8_CHAR_valid (n, u, c0, c1, c2, c3, true) of
|
||
(EXTENDED_UTF8_SHORTEST (n, u, c0, c1, c2, c3, ~1, ~1),
|
||
UTF8_CHAR_VALID_BYTES (c0, c1, c2, c3))
|
||
| {n : int} {u : int} {c0, c1, c2, c3 : int}
|
||
UTF8_CHAR_invalid (n, u, c0, c1, c2, c3, false) of
|
||
(UTF8_CHAR_INVALID_CASES (c0, c1, c2, c3),
|
||
UTF8_CHAR_INVALID_BYTES (c0, c1, c2, c3))
|
||
|
||
propdef UTF8_CHAR_VALID (n : int, u : int, c0 : int, c1 : int,
|
||
c2 : int, c3 : int) =
|
||
UTF8_CHAR_VALIDITY (n, u, c0, c1, c2, c3, true)
|
||
|
||
propdef UTF8_CHAR_INVALID (c0 : int, c1 : int, c2 : int, c3 : int) =
|
||
[n : int] [u : int] UTF8_CHAR_VALIDITY (n, u, c0, c1, c2, c3, false)
|
||
|
||
extern prfun
|
||
utf8_char_valid_implies_shortest :
|
||
{n : int} {u : int} {c0, c1, c2, c3 : int}
|
||
UTF8_CHAR_VALID (n, u, c0, c1, c2, c3) -<prf>
|
||
EXTENDED_UTF8_SHORTEST (n, u, c0, c1, c2, c3, ~1, ~1)
|
||
|
||
extern prfun
|
||
lemma_valid_utf8_character_1byte :
|
||
{u, c0 : int |
|
||
is_extended_utf8_1byte_first_byte c0;
|
||
u == extended_utf8_char_1byte_decoding c0;
|
||
0 <= u; u <= 0x7F}
|
||
() -<prf> UTF8_CHAR_VALID (1, u, c0, ~1, ~1, ~1)
|
||
|
||
extern prfun
|
||
lemma_valid_utf8_character_2byte :
|
||
{u, c0, c1 : int |
|
||
is_extended_utf8_2byte_first_byte c0;
|
||
is_valid_utf8_continuation_byte c1;
|
||
u == extended_utf8_char_2byte_decoding (c0, c1);
|
||
0x7F < u; u <= 0x7FF}
|
||
() -<prf> UTF8_CHAR_VALID (2, u, c0, c1, ~1, ~1)
|
||
|
||
extern prfun
|
||
lemma_valid_utf8_character_3byte :
|
||
{u, c0, c1, c2 : int |
|
||
is_extended_utf8_3byte_first_byte c0;
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
u == extended_utf8_char_3byte_decoding (c0, c1, c2);
|
||
0x7FF < u; u <= 0xFFFF;
|
||
//
|
||
// Exclude the UTF-16 surrogate halves.
|
||
//
|
||
~(0xD800 <= u && u < 0xE000)}
|
||
() -<prf> UTF8_CHAR_VALID (3, u, c0, c1, c2, ~1)
|
||
|
||
extern prfun
|
||
lemma_valid_utf8_character_4byte :
|
||
{u, c0, c1, c2, c3 : int |
|
||
is_extended_utf8_4byte_first_byte c0;
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
is_valid_utf8_continuation_byte c3;
|
||
u == extended_utf8_char_4byte_decoding (c0, c1, c2, c3);
|
||
0xFFFF < u; u <= 0x10FFFF}
|
||
() -<prf> UTF8_CHAR_VALID (4, u, c0, c1, c2, c3)
|
||
|
||
extern prfun
|
||
utf8_character_1byte_valid_bytes :
|
||
// This does not really do anything, but is included
|
||
// for completeness.
|
||
{u, c0 : int | is_extended_utf8_1byte_first_byte c0}
|
||
UTF8_CHAR_VALID (1, u, c0, ~1, ~1, ~1) -<prf> void
|
||
|
||
extern prfun
|
||
utf8_character_2byte_valid_bytes :
|
||
{u, c0, c1 : int | is_extended_utf8_2byte_first_byte c0}
|
||
UTF8_CHAR_VALID (2, u, c0, c1, ~1, ~1) -<prf>
|
||
[0xC2 <= c0; c0 <= 0xDF;
|
||
is_valid_utf8_continuation_byte c1]
|
||
void
|
||
|
||
extern prfun
|
||
utf8_character_3byte_valid_bytes :
|
||
{u, c0, c1, c2 : int | is_extended_utf8_3byte_first_byte c0}
|
||
UTF8_CHAR_VALID (3, u, c0, c1, c2, ~1) -<prf>
|
||
[(0xE1 <= c0 && c0 <= 0xEC)
|
||
|| c0 == 0xEE
|
||
|| c0 == 0xEF
|
||
|| (c0 == 0xE0 && 0xA0 <= c1)
|
||
|| (c0 == 0xED && c1 < 0xA0);
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2]
|
||
void
|
||
|
||
extern prfun
|
||
utf8_character_4byte_valid_bytes :
|
||
{u, c0, c1, c2, c3 : int | is_extended_utf8_4byte_first_byte c0}
|
||
UTF8_CHAR_VALID (4, u, c0, c1, c2, c3) -<prf>
|
||
[(0xF1 <= c0 && c0 <= 0xF3)
|
||
|| (c0 == 0xF0 && 0x90 <= c1)
|
||
|| (c0 == 0xF4 && c1 < 0x90);
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
is_valid_utf8_continuation_byte c3]
|
||
void
|
||
|
||
extern prfun
|
||
utf8_character_1byte_invalid_bytes :
|
||
// This does not really do anything, but is included
|
||
// for completeness.
|
||
{c0 : int | is_extended_utf8_1byte_first_byte c0}
|
||
UTF8_CHAR_INVALID (c0, ~1, ~1, ~1) -<prf> void
|
||
|
||
extern prfun
|
||
utf8_character_2byte_invalid_bytes :
|
||
{c0, c1 : int | is_extended_utf8_2byte_first_byte c0}
|
||
UTF8_CHAR_INVALID (c0, c1, ~1, ~1) -<prf>
|
||
[c0 == 0xC0 || c0 == 0xc1 || is_invalid_utf8_continuation_byte c1]
|
||
void
|
||
|
||
extern prfun
|
||
utf8_character_3byte_invalid_bytes :
|
||
{c0, c1, c2 : int | is_extended_utf8_3byte_first_byte c0}
|
||
UTF8_CHAR_INVALID (c0, c1, c2, ~1) -<prf>
|
||
[(c0 == 0xE0 && c1 < 0xA0) ||
|
||
(c0 == 0xED && 0xA0 <= c1) ||
|
||
is_invalid_utf8_continuation_byte c1 ||
|
||
is_invalid_utf8_continuation_byte c2]
|
||
void
|
||
|
||
extern prfun
|
||
utf8_character_4byte_invalid_bytes :
|
||
{c0, c1, c2, c3 : int | is_extended_utf8_4byte_first_byte c0}
|
||
UTF8_CHAR_INVALID (c0, c1, c2, c3) -<prf>
|
||
[0xF4 < c0 ||
|
||
(c0 == 0xF0 && c1 < 0x90) ||
|
||
(c0 == 0xF4 && 0x90 <= c1) ||
|
||
is_invalid_utf8_continuation_byte c1 ||
|
||
is_invalid_utf8_continuation_byte c2 ||
|
||
is_invalid_utf8_continuation_byte c3]
|
||
void
|
||
|
||
extern fun {}
|
||
is_valid_utf8_character_1byte :
|
||
{c0 : int | is_extended_utf8_1byte_first_byte c0}
|
||
int c0 -<>
|
||
[b : bool | b == true] [u : int]
|
||
(UTF8_CHAR_VALIDITY (1, u, c0, ~1, ~1, ~1, b) | bool b)
|
||
|
||
extern fun {}
|
||
is_valid_utf8_character_2byte :
|
||
{c0, c1 : int | is_extended_utf8_2byte_first_byte c0}
|
||
(int c0, int c1) -<>
|
||
[b : bool] [u : int]
|
||
(UTF8_CHAR_VALIDITY (2, u, c0, c1, ~1, ~1, b) | bool b)
|
||
|
||
extern fun {}
|
||
is_valid_utf8_character_3byte :
|
||
{c0, c1, c2 : int | is_extended_utf8_3byte_first_byte c0}
|
||
(int c0, int c1, int c2) -<>
|
||
[b : bool] [u : int]
|
||
(UTF8_CHAR_VALIDITY (3, u, c0, c1, c2, ~1, b) | bool b)
|
||
|
||
extern fun {}
|
||
is_valid_utf8_character_4byte :
|
||
{c0, c1, c2, c3 : int | is_extended_utf8_4byte_first_byte c0}
|
||
(int c0, int c1, int c2, int c3) -<>
|
||
[b : bool] [u : int]
|
||
(UTF8_CHAR_VALIDITY (4, u, c0, c1, c2, c3, b) | bool b)
|
||
|
||
extern fun {}
|
||
decode_utf8_1byte :
|
||
{c0 : int | is_extended_utf8_1byte_first_byte c0}
|
||
int c0 -<> [u : int | 0 <= u; u <= 0x7F] int u
|
||
|
||
extern fun {}
|
||
decode_utf8_2byte :
|
||
{c0, c1 : int | 0xC2 <= c0; c0 <= 0xDF;
|
||
is_valid_utf8_continuation_byte c1}
|
||
(int c0, int c1) -<> [u : int | 0x7F < u; u <= 0x7FF] int u
|
||
|
||
extern fun {}
|
||
decode_utf8_3byte :
|
||
{c0, c1, c2 : int | (0xE1 <= c0 && c0 <= 0xEC)
|
||
|| c0 == 0xEE
|
||
|| c0 == 0xEF
|
||
|| (c0 == 0xE0 && 0xA0 <= c1)
|
||
|| (c0 == 0xED && c1 < 0xA0);
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2}
|
||
(int c0, int c1, int c2) -<>
|
||
[u : int | 0x7FF < u; u <= 0xFFFF; u < 0xD800 || 0xE000 <= u] int u
|
||
|
||
extern fun {}
|
||
decode_utf8_4byte :
|
||
{c0, c1, c2, c3 : int | (0xF1 <= c0 && c0 <= 0xF3)
|
||
|| (c0 == 0xF0 && 0x90 <= c1)
|
||
|| (c0 == 0xF4 && c1 < 0x90);
|
||
is_valid_utf8_continuation_byte c1;
|
||
is_valid_utf8_continuation_byte c2;
|
||
is_valid_utf8_continuation_byte c3}
|
||
(int c0, int c1, int c2, int c3) -<>
|
||
[u : int | 0xFFFF < u; u <= 0x10FFFF] int u
|
||
|
||
extern fun {}
|
||
encode_utf8_character :
|
||
{u : int | is_valid_unicode_code_point u}
|
||
int u -<>
|
||
[n : int]
|
||
[c0, c1, c2, c3 : int | extended_utf8_char_length_relation (c0, n)]
|
||
@(UTF8_CHAR_VALID (n, u, c0, c1, c2, c3) |
|
||
int n, int c0, int c1, int c2, int c3)
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
#define utf8_decode_next_error_char ~1
|
||
|
||
(* Returns @(utf8_decode_next_error_char, ..) on error. *)
|
||
extern fun
|
||
utf8_array_decode_next {utf8len : int | 0 < utf8len}
|
||
{n_utf8arr : int | utf8len <= n_utf8arr}
|
||
{i : int | i < utf8len}
|
||
(utf8len : size_t utf8len,
|
||
utf8arr : &(@[char][n_utf8arr]),
|
||
i : size_t i) :<>
|
||
[c : int | is_valid_unicode_code_point c ||
|
||
c == utf8_decode_next_error_char]
|
||
[i_next : int | i_next == i + 1 || i_next == i + 2 ||
|
||
i_next == i + 3 || i_next == i + 4;
|
||
i_next <= utf8len]
|
||
@(int c, size_t i_next)
|
||
|
||
extern fun
|
||
utf8_string_decode_next {utf8len : int | 0 < utf8len}
|
||
{n_utf8str : int | utf8len <= n_utf8str}
|
||
{i : int | i < utf8len}
|
||
(utf8len : size_t utf8len,
|
||
utf8str : string n_utf8str,
|
||
i : size_t i) :<>
|
||
[c : int | is_valid_unicode_code_point c ||
|
||
c == utf8_decode_next_error_char]
|
||
[i_next : int | i_next == i + 1 || i_next == i + 2 ||
|
||
i_next == i + 3 || i_next == i + 4;
|
||
i_next <= utf8len]
|
||
@(int c, size_t i_next)
|
||
|
||
overload utf8_decode_next with utf8_array_decode_next
|
||
overload utf8_decode_next with utf8_string_decode_next
|
||
|
||
(*###################### IMPLEMENTATION ############################*)
|
||
|
||
// Integer division by 64 is equivalent to shifting right
|
||
// by 6 bits.
|
||
|
||
extern prfun _shift_right_twelve :
|
||
{x, y, z : nat | y == (x \ndiv 64);
|
||
z == (y \ndiv 64)}
|
||
(int x, int y, int z) -<prf> [z == (x \ndiv (64 * 64))] void
|
||
|
||
primplement _shift_right_twelve (x, y, z) =
|
||
()
|
||
|
||
extern prfun _shift_right_eighteen :
|
||
{x, y, z, u : nat | y == (x \ndiv 64);
|
||
z == (y \ndiv 64);
|
||
u == (z \ndiv 64)}
|
||
(int x, int y, int z, int u) -<prf> [u == (x \ndiv (64 * 64 * 64))] void
|
||
|
||
primplement _shift_right_eighteen (x, y, z, u) = ()
|
||
|
||
|
||
extern prfun _shift_right_twenty_four :
|
||
{x, y, z, u, v : nat | y == (x \ndiv 64);
|
||
z == (y \ndiv 64);
|
||
u == (z \ndiv 64);
|
||
v == (u \ndiv 64)}
|
||
(int x, int y, int z, int u, int v) -<prf>
|
||
[v == (x \ndiv (64 * 64 * 64 * 64))] void
|
||
|
||
primplement _shift_right_twenty_four (x, y, z, u, v) = ()
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
implement {}
|
||
is_valid_unicode_code_point u =
|
||
((0x0 <= u) * (u < 0xD800)) + ((0xE000 <= u) * (u <= 0x10FFFF))
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
primplement
|
||
extended_utf8_char_length_relation_to_length () = ()
|
||
|
||
primplement
|
||
extended_utf8_char_length_to_length_relation () = ()
|
||
|
||
primplement
|
||
utf8_char_length_relation_to_length () = ()
|
||
|
||
primplement
|
||
utf8_char_length_to_length_relation () = ()
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
implement {}
|
||
is_valid_utf8_continuation_byte c =
|
||
(0x80 <= c) * (c <= 0xBF)
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
primplement
|
||
decode_extended_utf8_istot {n} {c0, c1, c2, c3, c4, c5} () =
|
||
sif n == 1 then
|
||
let
|
||
prfn
|
||
make_pf {u : int | u == extended_utf8_char_1byte_decoding (c0)}
|
||
() :<prf>
|
||
EXTENDED_UTF8_CHAR (n, u, c0, c1, c2, c3, c4, c5) =
|
||
EXTENDED_UTF8_CHAR_1byte ()
|
||
stadef u = extended_utf8_char_1byte_decoding (c0)
|
||
in
|
||
make_pf {u} ()
|
||
end
|
||
else sif n == 2 then
|
||
let
|
||
prfn
|
||
make_pf {u : int | u == extended_utf8_char_2byte_decoding (c0, c1)}
|
||
() :<prf>
|
||
EXTENDED_UTF8_CHAR (n, u, c0, c1, c2, c3, c4, c5) =
|
||
EXTENDED_UTF8_CHAR_2byte ()
|
||
stadef u = extended_utf8_char_2byte_decoding (c0, c1)
|
||
in
|
||
make_pf {u} ()
|
||
end
|
||
else sif n == 3 then
|
||
let
|
||
prfn
|
||
make_pf {u : int | u == extended_utf8_char_3byte_decoding (c0, c1, c2)}
|
||
() :<prf>
|
||
EXTENDED_UTF8_CHAR (n, u, c0, c1, c2, c3, c4, c5) =
|
||
EXTENDED_UTF8_CHAR_3byte ()
|
||
stadef u = extended_utf8_char_3byte_decoding (c0, c1, c2)
|
||
in
|
||
make_pf {u} ()
|
||
end
|
||
else sif n == 4 then
|
||
let
|
||
prfn
|
||
make_pf {u : int |
|
||
u == extended_utf8_char_4byte_decoding (c0, c1, c2, c3)}
|
||
() :<prf>
|
||
EXTENDED_UTF8_CHAR (n, u, c0, c1, c2, c3, c4, c5) =
|
||
EXTENDED_UTF8_CHAR_4byte ()
|
||
stadef u = extended_utf8_char_4byte_decoding (c0, c1, c2, c3)
|
||
in
|
||
make_pf {u} ()
|
||
end
|
||
else sif n == 5 then
|
||
let
|
||
prfn
|
||
make_pf {u : int |
|
||
u == extended_utf8_char_5byte_decoding (c0, c1, c2, c3, c4)}
|
||
() :<prf>
|
||
EXTENDED_UTF8_CHAR (n, u, c0, c1, c2, c3, c4, c5) =
|
||
EXTENDED_UTF8_CHAR_5byte ()
|
||
stadef u = extended_utf8_char_5byte_decoding (c0, c1, c2, c3, c4)
|
||
in
|
||
make_pf {u} ()
|
||
end
|
||
else
|
||
let
|
||
prfn
|
||
make_pf {u : int |
|
||
u == extended_utf8_char_6byte_decoding (c0, c1, c2, c3, c4, c5)}
|
||
() :<prf>
|
||
EXTENDED_UTF8_CHAR (n, u, c0, c1, c2, c3, c4, c5) =
|
||
EXTENDED_UTF8_CHAR_6byte ()
|
||
stadef u = extended_utf8_char_6byte_decoding (c0, c1, c2, c3, c4, c5)
|
||
in
|
||
make_pf {u} ()
|
||
end
|
||
|
||
primplement
|
||
decode_extended_utf8_isfun {na} (pf_a, pf_b) =
|
||
sif na == 1 then
|
||
let
|
||
prval EXTENDED_UTF8_CHAR_1byte () = pf_a
|
||
prval EXTENDED_UTF8_CHAR_1byte () = pf_b
|
||
in
|
||
end
|
||
else sif na == 2 then
|
||
let
|
||
prval EXTENDED_UTF8_CHAR_2byte () = pf_a
|
||
prval EXTENDED_UTF8_CHAR_2byte () = pf_b
|
||
in
|
||
end
|
||
else sif na == 3 then
|
||
let
|
||
prval EXTENDED_UTF8_CHAR_3byte () = pf_a
|
||
prval EXTENDED_UTF8_CHAR_3byte () = pf_b
|
||
in
|
||
end
|
||
else sif na == 4 then
|
||
let
|
||
prval EXTENDED_UTF8_CHAR_4byte () = pf_a
|
||
prval EXTENDED_UTF8_CHAR_4byte () = pf_b
|
||
in
|
||
end
|
||
else sif na == 5 then
|
||
let
|
||
prval EXTENDED_UTF8_CHAR_5byte () = pf_a
|
||
prval EXTENDED_UTF8_CHAR_5byte () = pf_b
|
||
in
|
||
end
|
||
else
|
||
let
|
||
prval EXTENDED_UTF8_CHAR_6byte () = pf_a
|
||
prval EXTENDED_UTF8_CHAR_6byte () = pf_b
|
||
in
|
||
end
|
||
|
||
|
||
primplement
|
||
lemma_extended_utf8_char_length pf_char =
|
||
case+ pf_char of
|
||
| EXTENDED_UTF8_CHAR_1byte () => ()
|
||
| EXTENDED_UTF8_CHAR_2byte () => ()
|
||
| EXTENDED_UTF8_CHAR_3byte () => ()
|
||
| EXTENDED_UTF8_CHAR_4byte () => ()
|
||
| EXTENDED_UTF8_CHAR_5byte () => ()
|
||
| EXTENDED_UTF8_CHAR_6byte () => ()
|
||
|
||
implement {}
|
||
decode_extended_utf8_1byte c0 =
|
||
(EXTENDED_UTF8_CHAR_1byte () | c0)
|
||
|
||
implement {}
|
||
decode_extended_utf8_2byte (c0, c1) =
|
||
let
|
||
val u0 = c0 - 0xC0
|
||
val u1 = c1 - 0x80
|
||
in
|
||
(EXTENDED_UTF8_CHAR_2byte () | 64 * u0 + u1)
|
||
end
|
||
|
||
implement {}
|
||
decode_extended_utf8_3byte (c0, c1, c2) =
|
||
let
|
||
val u0 = c0 - 0xE0
|
||
val u1 = c1 - 0x80
|
||
val u2 = c2 - 0x80
|
||
in
|
||
(EXTENDED_UTF8_CHAR_3byte () | 64 * (64 * u0 + u1) + u2)
|
||
end
|
||
|
||
implement {}
|
||
decode_extended_utf8_4byte (c0, c1, c2, c3) =
|
||
let
|
||
val u0 = c0 - 0xF0
|
||
val u1 = c1 - 0x80
|
||
val u2 = c2 - 0x80
|
||
val u3 = c3 - 0x80
|
||
in
|
||
(EXTENDED_UTF8_CHAR_4byte () | 64 * (64 * (64 * u0 + u1) + u2) + u3)
|
||
end
|
||
|
||
implement {}
|
||
decode_extended_utf8_5byte (c0, c1, c2, c3, c4) =
|
||
let
|
||
val u0 = c0 - 0xF8
|
||
val u1 = c1 - 0x80
|
||
val u2 = c2 - 0x80
|
||
val u3 = c3 - 0x80
|
||
val u4 = c4 - 0x80
|
||
in
|
||
(EXTENDED_UTF8_CHAR_5byte () |
|
||
64 * (64 * (64 * (64 * u0 + u1) + u2) + u3) + u4)
|
||
end
|
||
|
||
implement {}
|
||
decode_extended_utf8_6byte (c0, c1, c2, c3, c4, c5) =
|
||
let
|
||
val u0 = c0 - 0xFC
|
||
val u1 = c1 - 0x80
|
||
val u2 = c2 - 0x80
|
||
val u3 = c3 - 0x80
|
||
val u4 = c4 - 0x80
|
||
val u5 = c5 - 0x80
|
||
in
|
||
(EXTENDED_UTF8_CHAR_6byte () |
|
||
64 * (64 * (64 * (64 * (64 * u0 + u1) + u2) + u3) + u4) + u5)
|
||
end
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
primplement
|
||
extended_utf8_shortest_is_char pf_shortest =
|
||
case+ pf_shortest of
|
||
| EXTENDED_UTF8_SHORTEST_1byte pf_char => pf_char
|
||
| EXTENDED_UTF8_SHORTEST_2byte pf_char => pf_char
|
||
| EXTENDED_UTF8_SHORTEST_3byte pf_char => pf_char
|
||
| EXTENDED_UTF8_SHORTEST_4byte pf_char => pf_char
|
||
| EXTENDED_UTF8_SHORTEST_5byte pf_char => pf_char
|
||
| EXTENDED_UTF8_SHORTEST_6byte pf_char => pf_char
|
||
|
||
primplement
|
||
lemma_extended_utf8_shortest_length pf_shortest =
|
||
case+ pf_shortest of
|
||
| EXTENDED_UTF8_SHORTEST_1byte pf_char =>
|
||
{ prval EXTENDED_UTF8_CHAR_1byte () = pf_char }
|
||
| EXTENDED_UTF8_SHORTEST_2byte pf_char =>
|
||
{ prval EXTENDED_UTF8_CHAR_2byte () = pf_char }
|
||
| EXTENDED_UTF8_SHORTEST_3byte pf_char =>
|
||
{ prval EXTENDED_UTF8_CHAR_3byte () = pf_char }
|
||
| EXTENDED_UTF8_SHORTEST_4byte pf_char =>
|
||
{ prval EXTENDED_UTF8_CHAR_4byte () = pf_char }
|
||
| EXTENDED_UTF8_SHORTEST_5byte pf_char =>
|
||
{ prval EXTENDED_UTF8_CHAR_5byte () = pf_char }
|
||
| EXTENDED_UTF8_SHORTEST_6byte pf_char =>
|
||
{ prval EXTENDED_UTF8_CHAR_6byte () = pf_char }
|
||
|
||
implement {}
|
||
encode_extended_utf8_character u =
|
||
if u <= 0x7F then
|
||
let
|
||
val c0 = u
|
||
in
|
||
@(EXTENDED_UTF8_SHORTEST_1byte (EXTENDED_UTF8_CHAR_1byte ()) |
|
||
1, c0, ~1, ~1, ~1, ~1, ~1)
|
||
end
|
||
else if u <= 0x7FF then
|
||
let
|
||
val c1 = 0x80 + (u \nmod 64)
|
||
val u1 = u \ndiv 64
|
||
val c0 = 0xC0 + u1
|
||
in
|
||
@(EXTENDED_UTF8_SHORTEST_2byte (EXTENDED_UTF8_CHAR_2byte ()) |
|
||
2, c0, c1, ~1, ~1, ~1, ~1)
|
||
end
|
||
else if u <= 0xFFFF then
|
||
let
|
||
val c2 = 0x80 + (u \nmod 64)
|
||
val u1 = u \ndiv 64
|
||
val c1 = 0x80 + (u1 \nmod 64)
|
||
val u2 = u1 \ndiv 64
|
||
val c0 = 0xE0 + u2
|
||
in
|
||
@(EXTENDED_UTF8_SHORTEST_3byte (EXTENDED_UTF8_CHAR_3byte ()) |
|
||
3, c0, c1, c2, ~1, ~1, ~1)
|
||
end
|
||
else if u <= 0x1FFFFF then
|
||
let
|
||
val c3 = 0x80 + (u \nmod 64)
|
||
val u1 = u \ndiv 64
|
||
val c2 = 0x80 + (u1 \nmod 64)
|
||
val u2 = u1 \ndiv 64
|
||
val c1 = 0x80 + (u2 \nmod 64)
|
||
val u3 = u2 \ndiv 64
|
||
val c0 = 0xF0 + u3
|
||
prval () = _shift_right_twelve (u, u1, u2)
|
||
in
|
||
@(EXTENDED_UTF8_SHORTEST_4byte (EXTENDED_UTF8_CHAR_4byte ()) |
|
||
4, c0, c1, c2, c3, ~1, ~1)
|
||
end
|
||
else if u <= 0x3FFFFFF then
|
||
let
|
||
val c4 = 0x80 + (u \nmod 64)
|
||
val u1 = u \ndiv 64
|
||
val c3 = 0x80 + (u1 \nmod 64)
|
||
val u2 = u1 \ndiv 64
|
||
val c2 = 0x80 + (u2 \nmod 64)
|
||
val u3 = u2 \ndiv 64
|
||
val c1 = 0x80 + (u3 \nmod 64)
|
||
val u4 = u3 \ndiv 64
|
||
val c0 = 0xF8 + u4
|
||
prval () = _shift_right_twelve (u, u1, u2)
|
||
prval () = _shift_right_eighteen (u, u1, u2, u3)
|
||
in
|
||
@(EXTENDED_UTF8_SHORTEST_5byte (EXTENDED_UTF8_CHAR_5byte ()) |
|
||
5, c0, c1, c2, c3, c4, ~1)
|
||
end
|
||
else
|
||
let
|
||
val c5 = 0x80 + (u \nmod 64)
|
||
val u1 = u \ndiv 64
|
||
val c4 = 0x80 + (u1 \nmod 64)
|
||
val u2 = u1 \ndiv 64
|
||
val c3 = 0x80 + (u2 \nmod 64)
|
||
val u3 = u2 \ndiv 64
|
||
val c2 = 0x80 + (u3 \nmod 64)
|
||
val u4 = u3 \ndiv 64
|
||
val c1 = 0x80 + (u4 \nmod 64)
|
||
val u5 = u4 \ndiv 64
|
||
val c0 = 0xFC + u5
|
||
prval () = _shift_right_twelve (u, u1, u2)
|
||
prval () = _shift_right_eighteen (u, u1, u2, u3)
|
||
prval () = _shift_right_twenty_four (u, u1, u2, u3, u4)
|
||
in
|
||
@(EXTENDED_UTF8_SHORTEST_6byte (EXTENDED_UTF8_CHAR_6byte ()) |
|
||
6, c0, c1, c2, c3, c4, c5)
|
||
end
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
primplement
|
||
utf8_char_valid_implies_shortest pf_valid =
|
||
case+ pf_valid of
|
||
| UTF8_CHAR_valid (pf_shortest, _) => pf_shortest
|
||
|
||
primplement
|
||
lemma_valid_utf8_character_1byte {u, c0} () =
|
||
let
|
||
prfn
|
||
make_pf {u : int | u == extended_utf8_char_1byte_decoding (c0)}
|
||
() :<prf> EXTENDED_UTF8_CHAR (1, u, c0, ~1, ~1, ~1, ~1, ~1) =
|
||
EXTENDED_UTF8_CHAR_1byte ()
|
||
prval pf_char = make_pf {u} ()
|
||
prval pf_shortest = EXTENDED_UTF8_SHORTEST_1byte pf_char
|
||
prval pf_bytes = UTF8_CHAR_VALID_BYTES_1byte ()
|
||
prval pf_valid = UTF8_CHAR_valid (pf_shortest, pf_bytes)
|
||
in
|
||
pf_valid
|
||
end
|
||
|
||
primplement
|
||
lemma_valid_utf8_character_2byte {u, c0, c1} () =
|
||
let
|
||
prfn
|
||
make_pf {u : int | u == extended_utf8_char_2byte_decoding (c0, c1)}
|
||
() :<prf> EXTENDED_UTF8_CHAR (2, u, c0, c1, ~1, ~1, ~1, ~1) =
|
||
EXTENDED_UTF8_CHAR_2byte ()
|
||
prval pf_char = make_pf {u} ()
|
||
prval pf_shortest = EXTENDED_UTF8_SHORTEST_2byte pf_char
|
||
prval pf_bytes = UTF8_CHAR_VALID_BYTES_2byte ()
|
||
prval pf_valid = UTF8_CHAR_valid (pf_shortest, pf_bytes)
|
||
in
|
||
pf_valid
|
||
end
|
||
|
||
primplement
|
||
lemma_valid_utf8_character_3byte {u, c0, c1, c2} () =
|
||
let
|
||
prfn
|
||
make_pf {u : int | u == extended_utf8_char_3byte_decoding (c0, c1, c2)}
|
||
() :<prf> EXTENDED_UTF8_CHAR (3, u, c0, c1, c2, ~1, ~1, ~1) =
|
||
EXTENDED_UTF8_CHAR_3byte ()
|
||
prval pf_char = make_pf {u} ()
|
||
prfn lemma_c1 () :<prf> [c1 == 0x80 + ((u \ndiv 64) \nmod 64)] void =
|
||
{
|
||
// Convert to Horner form.
|
||
stadef u1 = 64 * (64 * (c0 - 0xE0) + (c1 - 0x80)) + (c2 - 0x80)
|
||
prval EQINT () = eqint_make {u, u1} ()
|
||
}
|
||
prval () = lemma_c1 ()
|
||
prval pf_shortest = EXTENDED_UTF8_SHORTEST_3byte pf_char
|
||
prval pf_bytes = UTF8_CHAR_VALID_BYTES_3byte ()
|
||
prval pf_valid = UTF8_CHAR_valid (pf_shortest, pf_bytes)
|
||
in
|
||
pf_valid
|
||
end
|
||
|
||
primplement
|
||
lemma_valid_utf8_character_4byte {u, c0, c1, c2, c3} () =
|
||
let
|
||
prfn
|
||
make_pf {u : int | u == extended_utf8_char_4byte_decoding (c0, c1, c2, c3)}
|
||
() :<prf> EXTENDED_UTF8_CHAR (4, u, c0, c1, c2, c3, ~1, ~1) =
|
||
EXTENDED_UTF8_CHAR_4byte ()
|
||
prval pf_char = make_pf {u} ()
|
||
prfn lemma_c1 () :<prf> [c1 == 0x80 + ((u \ndiv (64 * 64)) \nmod 64)] void =
|
||
{
|
||
prval EQINT () =
|
||
eqint_make {(64 * 64 * (c0 - 0xE0)) \ndiv (64 * 64), c0 - 0xE0} ()
|
||
prval pfd = divmod_istot {u \ndiv (64 * 64), 64} ()
|
||
prval pfm = divmod_elim pfd
|
||
prval () = mul_elim pfm
|
||
}
|
||
prval () = lemma_c1 ()
|
||
prfn lemma_c2 () :<prf> [c2 == 0x80 + ((u \ndiv 64) \nmod 64)] void =
|
||
{
|
||
// Convert to Horner form.
|
||
stadef u1 = 64 * (64 * (64 * (c0 - 0xF0) + (c1 - 0x80)) + (c2 - 0x80))
|
||
+ (c3 - 0x80)
|
||
prval EQINT () = eqint_make {u, u1} ()
|
||
}
|
||
prval () = lemma_c2 ()
|
||
prval pf_shortest = EXTENDED_UTF8_SHORTEST_4byte pf_char
|
||
prval pf_bytes = UTF8_CHAR_VALID_BYTES_4byte ()
|
||
prval pf_valid = UTF8_CHAR_valid (pf_shortest, pf_bytes)
|
||
in
|
||
pf_valid
|
||
end
|
||
|
||
primplement
|
||
utf8_character_1byte_valid_bytes pf_valid = ()
|
||
|
||
primplement
|
||
utf8_character_2byte_valid_bytes pf_valid =
|
||
{
|
||
prval UTF8_CHAR_valid (_, pf_bytes) = pf_valid
|
||
prval UTF8_CHAR_VALID_BYTES_2byte () = pf_bytes
|
||
}
|
||
|
||
primplement
|
||
utf8_character_3byte_valid_bytes pf_valid =
|
||
{
|
||
prval UTF8_CHAR_valid (_, pf_bytes) = pf_valid
|
||
prval UTF8_CHAR_VALID_BYTES_3byte () = pf_bytes
|
||
}
|
||
|
||
primplement
|
||
utf8_character_4byte_valid_bytes pf_valid =
|
||
{
|
||
prval UTF8_CHAR_valid (_, pf_bytes) = pf_valid
|
||
prval UTF8_CHAR_VALID_BYTES_4byte () = pf_bytes
|
||
}
|
||
|
||
primplement
|
||
utf8_character_1byte_invalid_bytes pf_invalid =
|
||
{
|
||
prval UTF8_CHAR_invalid (_, pf_bytes) = pf_invalid
|
||
prval UTF8_CHAR_INVALID_BYTES_1byte () = pf_bytes
|
||
}
|
||
|
||
primplement
|
||
utf8_character_2byte_invalid_bytes pf_invalid =
|
||
{
|
||
prval UTF8_CHAR_invalid (_, pf_bytes) = pf_invalid
|
||
prval UTF8_CHAR_INVALID_BYTES_2byte () = pf_bytes
|
||
}
|
||
|
||
primplement
|
||
utf8_character_3byte_invalid_bytes pf_invalid =
|
||
{
|
||
prval UTF8_CHAR_invalid (_, pf_bytes) = pf_invalid
|
||
prval UTF8_CHAR_INVALID_BYTES_3byte () = pf_bytes
|
||
}
|
||
|
||
primplement
|
||
utf8_character_4byte_invalid_bytes pf_invalid =
|
||
{
|
||
prval UTF8_CHAR_invalid (_, pf_bytes) = pf_invalid
|
||
prval UTF8_CHAR_INVALID_BYTES_4byte () = pf_bytes
|
||
}
|
||
|
||
implement {}
|
||
is_valid_utf8_character_1byte {c0} c0 =
|
||
let
|
||
stadef u = extended_utf8_char_1byte_decoding c0
|
||
in
|
||
(lemma_valid_utf8_character_1byte {u, c0} () | true)
|
||
end
|
||
|
||
implement {}
|
||
is_valid_utf8_character_2byte {c0, c1} (c0, c1) =
|
||
let
|
||
stadef u = extended_utf8_char_2byte_decoding (c0, c1)
|
||
in
|
||
if not (is_valid_utf8_continuation_byte c1) then
|
||
(UTF8_CHAR_invalid (UTF8_CHAR_INVALID_bad_c1 (),
|
||
UTF8_CHAR_INVALID_BYTES_2byte ()) |
|
||
false)
|
||
else if c0 < 0xC2 then
|
||
// The sequence is overlong.
|
||
(UTF8_CHAR_invalid (UTF8_CHAR_INVALID_invalid_2byte (),
|
||
UTF8_CHAR_INVALID_BYTES_2byte ()) |
|
||
false)
|
||
else
|
||
(lemma_valid_utf8_character_2byte {u, c0, c1} () | true)
|
||
end
|
||
|
||
implement {}
|
||
is_valid_utf8_character_3byte {c0, c1, c2} (c0, c1, c2) =
|
||
let
|
||
stadef u = extended_utf8_char_3byte_decoding (c0, c1, c2)
|
||
in
|
||
if not (is_valid_utf8_continuation_byte c1) then
|
||
(UTF8_CHAR_invalid (UTF8_CHAR_INVALID_bad_c1 (),
|
||
UTF8_CHAR_INVALID_BYTES_3byte ()) |
|
||
false)
|
||
else if not (is_valid_utf8_continuation_byte c2) then
|
||
(UTF8_CHAR_invalid (UTF8_CHAR_INVALID_bad_c2 (),
|
||
UTF8_CHAR_INVALID_BYTES_3byte ()) |
|
||
false)
|
||
else if (0xE1 <= c0) * (c0 <= 0xEC) then
|
||
(lemma_valid_utf8_character_3byte {u, c0, c1, c2} () | true)
|
||
else if c0 = 0xEE then
|
||
(lemma_valid_utf8_character_3byte {u, c0, c1, c2} () | true)
|
||
else if c0 = 0xEF then
|
||
(lemma_valid_utf8_character_3byte {u, c0, c1, c2} () | true)
|
||
else if (c0 = 0xE0) * (0xA0 <= c1) then
|
||
(lemma_valid_utf8_character_3byte {u, c0, c1, c2} () | true)
|
||
else if (c0 = 0xED) * (c1 < 0xA0) then
|
||
(lemma_valid_utf8_character_3byte {u, c0, c1, c2} () | true)
|
||
else
|
||
// Either the sequence is overlong or it decodes to
|
||
// an invalid code point.
|
||
(UTF8_CHAR_invalid (UTF8_CHAR_INVALID_invalid_3byte (),
|
||
UTF8_CHAR_INVALID_BYTES_3byte ()) |
|
||
false)
|
||
end
|
||
|
||
implement {}
|
||
is_valid_utf8_character_4byte {c0, c1, c2, c3} (c0, c1, c2, c3) =
|
||
let
|
||
stadef u = extended_utf8_char_4byte_decoding (c0, c1, c2, c3)
|
||
in
|
||
if not (is_valid_utf8_continuation_byte c1) then
|
||
(UTF8_CHAR_invalid (UTF8_CHAR_INVALID_bad_c1 (),
|
||
UTF8_CHAR_INVALID_BYTES_4byte ()) |
|
||
false)
|
||
else if not (is_valid_utf8_continuation_byte c2) then
|
||
(UTF8_CHAR_invalid (UTF8_CHAR_INVALID_bad_c2 (),
|
||
UTF8_CHAR_INVALID_BYTES_4byte ()) |
|
||
false)
|
||
else if not (is_valid_utf8_continuation_byte c3) then
|
||
(UTF8_CHAR_invalid (UTF8_CHAR_INVALID_bad_c3 (),
|
||
UTF8_CHAR_INVALID_BYTES_4byte ()) |
|
||
false)
|
||
else if (0xF1 <= c0) * (c0 <= 0xF3) then
|
||
(lemma_valid_utf8_character_4byte {u, c0, c1, c2, c3} () | true)
|
||
else if (c0 = 0xF0) * (0x90 <= c1) then
|
||
(lemma_valid_utf8_character_4byte {u, c0, c1, c2, c3} () | true)
|
||
else if (c0 = 0xF4) * (c1 < 0x90) then
|
||
(lemma_valid_utf8_character_4byte {u, c0, c1, c2, c3} () | true)
|
||
else
|
||
// Either the sequence is overlong or it decodes to
|
||
// an invalid code point.
|
||
(UTF8_CHAR_invalid (UTF8_CHAR_INVALID_invalid_4byte (),
|
||
UTF8_CHAR_INVALID_BYTES_4byte ()) |
|
||
false)
|
||
end
|
||
|
||
implement {}
|
||
decode_utf8_1byte c0 = c0
|
||
|
||
implement {}
|
||
decode_utf8_2byte (c0, c1) =
|
||
let
|
||
val u0 = c0 - 0xC0
|
||
val u1 = c1 - 0x80
|
||
in
|
||
64 * u0 + u1
|
||
end
|
||
|
||
implement {}
|
||
decode_utf8_3byte (c0, c1, c2) =
|
||
let
|
||
val u0 = c0 - 0xE0
|
||
val u1 = c1 - 0x80
|
||
val u2 = c2 - 0x80
|
||
in
|
||
64 * (64 * u0 + u1) + u2
|
||
end
|
||
|
||
implement {}
|
||
decode_utf8_4byte (c0, c1, c2, c3) =
|
||
let
|
||
val u0 = c0 - 0xF0
|
||
val u1 = c1 - 0x80
|
||
val u2 = c2 - 0x80
|
||
val u3 = c3 - 0x80
|
||
in
|
||
64 * (64 * (64 * u0 + u1) + u2) + u3
|
||
end
|
||
|
||
implement {}
|
||
encode_utf8_character {u} u =
|
||
if u <= 0x7F then
|
||
let
|
||
val c0 = u
|
||
stadef c0 = u
|
||
in
|
||
@(lemma_valid_utf8_character_1byte {u, c0} () | 1, c0, ~1, ~1, ~1)
|
||
end
|
||
else if u <= 0x7FF then
|
||
let
|
||
val c1 = 0x80 + (u \nmod 64)
|
||
val u1 = u \ndiv 64
|
||
val c0 = 0xC0 + u1
|
||
stadef c1 = 0x80 + (u \nmod 64)
|
||
stadef u1 = u \ndiv 64
|
||
stadef c0 = 0xC0 + u1
|
||
in
|
||
@(lemma_valid_utf8_character_2byte {u, c0, c1} () | 2, c0, c1, ~1, ~1)
|
||
end
|
||
else if u <= 0xFFFF then
|
||
let
|
||
val c2 = 0x80 + (u \nmod 64)
|
||
val u1 = u \ndiv 64
|
||
val c1 = 0x80 + (u1 \nmod 64)
|
||
val u2 = u1 \ndiv 64
|
||
val c0 = 0xE0 + u2
|
||
stadef c2 = 0x80 + (u \nmod 64)
|
||
stadef u1 = u \ndiv 64
|
||
stadef c1 = 0x80 + (u1 \nmod 64)
|
||
stadef u2 = u1 \ndiv 64
|
||
stadef c0 = 0xE0 + u2
|
||
in
|
||
@(lemma_valid_utf8_character_3byte {u, c0, c1, c2} () | 3, c0, c1, c2, ~1)
|
||
end
|
||
else
|
||
let
|
||
val c3 = 0x80 + (u \nmod 64)
|
||
val u1 = u \ndiv 64
|
||
val c2 = 0x80 + (u1 \nmod 64)
|
||
val u2 = u1 \ndiv 64
|
||
val c1 = 0x80 + (u2 \nmod 64)
|
||
val u3 = u2 \ndiv 64
|
||
val c0 = 0xF0 + u3
|
||
stadef c3 = 0x80 + (u \nmod 64)
|
||
stadef u1 = u \ndiv 64
|
||
stadef c2 = 0x80 + (u1 \nmod 64)
|
||
stadef u2 = u1 \ndiv 64
|
||
stadef c1 = 0x80 + (u2 \nmod 64)
|
||
stadef u3 = u2 \ndiv 64
|
||
stadef c0 = 0xF0 + u3
|
||
prval () = _shift_right_twelve (u, u1, u2)
|
||
in
|
||
@(lemma_valid_utf8_character_4byte {u, c0, c1, c2, c3} () |
|
||
4, c0, c1, c2, c3)
|
||
end
|
||
|
||
(*------------------------------------------------------------------*)
|
||
|
||
implement
|
||
utf8_array_decode_next (utf8len, utf8arr, i) =
|
||
let
|
||
macdef getchr (j) = g1ofg0 (char2u2i (utf8arr[,(j)]))
|
||
macdef error_return = @(utf8_decode_next_error_char, i + i2sz 1)
|
||
|
||
prval _ = lemma_g1uint_param (i)
|
||
|
||
val c0 = getchr (i)
|
||
in
|
||
if 0x00 <= c0 &&& c0 <= 0xFF then
|
||
let
|
||
val seqlen = utf8_character_length c0
|
||
in
|
||
case+ seqlen of
|
||
| 1 => @(c0, i + i2sz 1)
|
||
|
||
| 2 =>
|
||
if utf8len < i + i2sz 2 then
|
||
error_return
|
||
else
|
||
let
|
||
val c1 = getchr (i + i2sz 1)
|
||
val (pf | valid) =
|
||
is_valid_utf8_character_2byte (c0, c1)
|
||
in
|
||
if valid then
|
||
let
|
||
prval _ = utf8_character_2byte_valid_bytes pf
|
||
val code_point = decode_utf8_2byte (c0, c1)
|
||
in
|
||
@(code_point, i + i2sz 2)
|
||
end
|
||
else
|
||
error_return
|
||
end
|
||
|
||
| 3 =>
|
||
if utf8len < i + i2sz 3 then
|
||
error_return
|
||
else
|
||
let
|
||
val c1 = getchr (i + i2sz 1)
|
||
val c2 = getchr (i + i2sz 2)
|
||
val (pf | valid) =
|
||
is_valid_utf8_character_3byte (c0, c1, c2)
|
||
in
|
||
if valid then
|
||
let
|
||
prval _ = utf8_character_3byte_valid_bytes pf
|
||
val code_point = decode_utf8_3byte (c0, c1, c2)
|
||
in
|
||
@(code_point, i + i2sz 3)
|
||
end
|
||
else
|
||
error_return
|
||
end
|
||
|
||
| 4 =>
|
||
if utf8len < i + i2sz 4 then
|
||
error_return
|
||
else
|
||
let
|
||
val c1 = getchr (i + i2sz 1)
|
||
val c2 = getchr (i + i2sz 2)
|
||
val c3 = getchr (i + i2sz 3)
|
||
val (pf | valid) =
|
||
is_valid_utf8_character_4byte (c0, c1, c2, c3)
|
||
in
|
||
if valid then
|
||
let
|
||
prval _ = utf8_character_4byte_valid_bytes pf
|
||
val code_point = decode_utf8_4byte (c0, c1, c2, c3)
|
||
in
|
||
@(code_point, i + i2sz 4)
|
||
end
|
||
else
|
||
error_return
|
||
end
|
||
|
||
| _ => error_return
|
||
end
|
||
else
|
||
(* This branch should never be run on a system
|
||
with 8-bit char. *)
|
||
error_return
|
||
end
|
||
|
||
implement
|
||
utf8_string_decode_next {..} {n_utf8str} (utf8len, utf8str, i) =
|
||
let
|
||
val [p : addr] p = string2ptr utf8str
|
||
val (pf, consume_pf | p) =
|
||
$UNSAFE.ptr1_vtake{@[char][n_utf8str]} p
|
||
val result = utf8_array_decode_next (utf8len, !p, i)
|
||
prval _ = consume_pf pf
|
||
in
|
||
result
|
||
end
|
||
|
||
(*###################### DEMONSTRATION #############################*)
|
||
|
||
fn
|
||
encode_LATIN_CAPITAL_LETTER_A () : void =
|
||
{
|
||
val u = 0x0041
|
||
|
||
(* Return both a proof of valid encoding and the encoding. *)
|
||
val (pf_valid | n, c0, c1, c2, c3) = encode_utf8_character (u)
|
||
|
||
(* Verify the encoding. *)
|
||
val _ = assertloc (n = 1)
|
||
val _ = assertloc (c0 = 0x41)
|
||
}
|
||
|
||
fn
|
||
decode_LATIN_CAPITAL_LETTER_A () : void =
|
||
{
|
||
val str = "\x41\0"
|
||
val n = length str
|
||
val (c, i) = utf8_decode_next (n, str, i2sz 0)
|
||
|
||
(* Verify that the decoding is correct. *)
|
||
val _ = assertloc (c = 0x0041)
|
||
|
||
(* Verify that the next index is 1. *)
|
||
val _ = assertloc (i = i2sz 1)
|
||
}
|
||
|
||
fn
|
||
encode_LATIN_SMALL_LETTER_O_WITH_DIAERESIS () : void =
|
||
{
|
||
val u = 0x00F6
|
||
|
||
(* Return both a proof of valid encoding and the encoding. *)
|
||
val (pf_valid | n, c0, c1, c2, c3) = encode_utf8_character (u)
|
||
|
||
(* Verify the encoding. *)
|
||
val _ = assertloc (n = 2)
|
||
val _ = assertloc (c0 = 0xC3)
|
||
val _ = assertloc (c1 = 0xB6)
|
||
}
|
||
|
||
fn
|
||
decode_LATIN_SMALL_LETTER_O_WITH_DIAERESIS () : void =
|
||
{
|
||
val str = "\xC3\xB6\0"
|
||
val n = length str
|
||
val (c, i) = utf8_decode_next (n, str, i2sz 0)
|
||
|
||
(* Verify that the decoding is correct. *)
|
||
val _ = assertloc (c = 0x00F6)
|
||
|
||
(* Verify that the next index is 2. *)
|
||
val _ = assertloc (i = i2sz 2)
|
||
}
|
||
|
||
fn
|
||
encode_CYRILLIC_CAPITAL_LETTER_ZHE () : void =
|
||
{
|
||
val u = 0x0416
|
||
|
||
(* Return both a proof of valid encoding and the encoding. *)
|
||
val (pf_valid | n, c0, c1, c2, c3) = encode_utf8_character (u)
|
||
|
||
(* Verify the encoding. *)
|
||
val _ = assertloc (n = 2)
|
||
val _ = assertloc (c0 = 0xD0)
|
||
val _ = assertloc (c1 = 0x96)
|
||
}
|
||
|
||
fn
|
||
decode_CYRILLIC_CAPITAL_LETTER_ZHE () : void =
|
||
{
|
||
val str = "\xD0\x96\0"
|
||
val n = length str
|
||
val (c, i) = utf8_decode_next (n, str, i2sz 0)
|
||
|
||
(* Verify that the decoding is correct. *)
|
||
val _ = assertloc (c = 0x0416)
|
||
|
||
(* Verify that the next index is 2. *)
|
||
val _ = assertloc (i = i2sz 2)
|
||
}
|
||
|
||
fn
|
||
encode_EURO_SIGN () : void =
|
||
{
|
||
val u = 0x20AC
|
||
|
||
(* Return both a proof of valid encoding and the encoding. *)
|
||
val (pf_valid | n, c0, c1, c2, c3) = encode_utf8_character (u)
|
||
|
||
(* Verify the encoding. *)
|
||
val _ = assertloc (n = 3)
|
||
val _ = assertloc (c0 = 0xE2)
|
||
val _ = assertloc (c1 = 0x82)
|
||
val _ = assertloc (c2 = 0xAC)
|
||
}
|
||
|
||
fn
|
||
decode_EURO_SIGN () : void =
|
||
{
|
||
val str = "\xE2\x82\xAC\0"
|
||
val n = length str
|
||
val (c, i) = utf8_decode_next (n, str, i2sz 0)
|
||
|
||
(* Verify that the decoding is correct. *)
|
||
val _ = assertloc (c = 0x20AC)
|
||
|
||
(* Verify that the next index is 3. *)
|
||
val _ = assertloc (i = i2sz 3)
|
||
}
|
||
|
||
fn
|
||
encode_MUSICAL_SYMBOL_G_CLEF () : void =
|
||
{
|
||
val u = 0x1D11E
|
||
|
||
(* Return both a proof of valid encoding and the encoding. *)
|
||
val (pf_valid | n, c0, c1, c2, c3) = encode_utf8_character (u)
|
||
|
||
(* Verify the encoding. *)
|
||
val _ = assertloc (n = 4)
|
||
val _ = assertloc (c0 = 0xF0)
|
||
val _ = assertloc (c1 = 0x9D)
|
||
val _ = assertloc (c2 = 0x84)
|
||
val _ = assertloc (c3 = 0x9E)
|
||
}
|
||
|
||
fn
|
||
decode_MUSICAL_SYMBOL_G_CLEF () : void =
|
||
{
|
||
val str = "\xF0\x9D\x84\x9E\0"
|
||
val n = length str
|
||
val (c, i) = utf8_decode_next (n, str, i2sz 0)
|
||
|
||
(* Verify that the decoding is correct. *)
|
||
val _ = assertloc (c = 0x1D11E)
|
||
|
||
(* Verify that the next index is 4. *)
|
||
val _ = assertloc (i = i2sz 4)
|
||
}
|
||
|
||
implement
|
||
main0 () =
|
||
begin
|
||
encode_LATIN_CAPITAL_LETTER_A ();
|
||
decode_LATIN_CAPITAL_LETTER_A ();
|
||
|
||
encode_LATIN_SMALL_LETTER_O_WITH_DIAERESIS ();
|
||
decode_LATIN_SMALL_LETTER_O_WITH_DIAERESIS ();
|
||
|
||
encode_CYRILLIC_CAPITAL_LETTER_ZHE ();
|
||
decode_CYRILLIC_CAPITAL_LETTER_ZHE ();
|
||
|
||
encode_EURO_SIGN ();
|
||
decode_EURO_SIGN ();
|
||
|
||
encode_MUSICAL_SYMBOL_G_CLEF ();
|
||
decode_MUSICAL_SYMBOL_G_CLEF ();
|
||
|
||
println! ("SUCCESS")
|
||
end
|