302 lines
9.9 KiB
Plaintext
302 lines
9.9 KiB
Plaintext
#!/usr/local/bin/a68g --script #
|
||
# -*- coding: utf-8 -*- #
|
||
|
||
# UNICHAR/UNICODE must be printed using REPR to convert to UTF8 #
|
||
|
||
MODE UNICHAR = STRUCT(BITS #31# bits); # assuming bits width >=31 #
|
||
MODE UNICODE = FLEX[0]UNICHAR;
|
||
|
||
OP INITUNICHAR = (BITS bits)UNICHAR: (UNICHAR out; bits OF out := #ABS# bits; out);
|
||
OP INITUNICHAR = (CHAR char)UNICHAR: (UNICHAR out; bits OF out := BIN ABS char; out);
|
||
OP INITBITS = (UNICHAR unichar)BITS: #BIN# bits OF unichar;
|
||
|
||
PROC raise value error = ([]UNION(FORMAT,BITS,STRING)argv )VOID: (
|
||
putf(stand error, argv); stop
|
||
);
|
||
|
||
MODE YIELDCHAR = PROC(CHAR)VOID; MODE GENCHAR = PROC(YIELDCHAR)VOID;
|
||
MODE YIELDUNICHAR = PROC(UNICHAR)VOID; MODE GENUNICHAR = PROC(YIELDUNICHAR)VOID;
|
||
|
||
PRIO DOCONV = 1;
|
||
|
||
# Convert a stream of UNICHAR into a stream of UTFCHAR #
|
||
OP DOCONV = (GENUNICHAR gen unichar, YIELDCHAR yield)VOID:(
|
||
BITS non ascii = NOT 2r1111111;
|
||
# FOR UNICHAR unichar IN # gen unichar( # ) DO ( #
|
||
## (UNICHAR unichar)VOID: (
|
||
BITS bits := INITBITS unichar;
|
||
IF (bits AND non ascii) = 2r0 THEN # ascii #
|
||
yield(REPR ABS bits)
|
||
ELSE
|
||
FLEX[6]CHAR buf := "?"*6; # initialise work around #
|
||
INT bytes := 0;
|
||
BITS byte lead bits = 2r10000000;
|
||
FOR ofs FROM UPB buf BY -1 WHILE
|
||
bytes +:= 1;
|
||
buf[ofs]:= REPR ABS (byte lead bits OR bits AND 2r111111);
|
||
bits := bits SHR 6;
|
||
# WHILE # bits NE 2r0 DO
|
||
SKIP
|
||
OD;
|
||
BITS first byte lead bits = BIN (ABS(2r1 SHL bytes)-2) SHL (UPB buf - bytes + 1);
|
||
buf := buf[UPB buf-bytes+1:];
|
||
buf[1] := REPR ABS(BIN ABS buf[1] OR first byte lead bits);
|
||
FOR i TO UPB buf DO yield(buf[i]) OD
|
||
FI
|
||
# OD # ))
|
||
);
|
||
|
||
# Convert a STRING into a stream of UNICHAR #
|
||
OP DOCONV = (STRING string, YIELDUNICHAR yield)VOID: (
|
||
PROC gen char = (YIELDCHAR yield)VOID:
|
||
FOR i FROM LWB string TO UPB string DO yield(string[i]) OD;
|
||
gen char DOCONV yield
|
||
);
|
||
|
||
CO Prosser/Thompson UTF8 encoding scheme
|
||
Bits Last code point Byte 1 Byte 2 Byte 3 Byte 4 Byte 5 Byte 6
|
||
7 U+007F 0xxxxxxx
|
||
11 U+07FF 110xxxxx 10xxxxxx
|
||
16 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
||
21 U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||
26 U+3FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||
31 U+7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||
END CO
|
||
|
||
# Quickly calculate the length of the UTF8 encoded string #
|
||
PROC upb utf8 = (STRING utf8 string)INT:(
|
||
INT bytes to go := 0;
|
||
INT upb := 0;
|
||
FOR i FROM LWB utf8 string TO UPB utf8 string DO
|
||
CHAR byte := utf8 string[i];
|
||
IF bytes to go = 0 THEN # start new utf char #
|
||
bytes to go :=
|
||
IF ABS byte <= ABS 2r01111111 THEN 1 # 7 bits #
|
||
ELIF ABS byte <= ABS 2r11011111 THEN 2 # 11 bits #
|
||
ELIF ABS byte <= ABS 2r11101111 THEN 3 # 16 bits #
|
||
ELIF ABS byte <= ABS 2r11110111 THEN 4 # 21 bits #
|
||
ELIF ABS byte <= ABS 2r11111011 THEN 5 # 26 bits #
|
||
ELIF ABS byte <= ABS 2r11111101 THEN 6 # 31 bits #
|
||
ELSE raise value error(("Invalid UTF-8 bytes", BIN ABS byte)); ~ FI
|
||
FI;
|
||
bytes to go -:= 1; # skip over trailing bytes #
|
||
IF bytes to go = 0 THEN upb +:= 1 FI
|
||
OD;
|
||
upb
|
||
);
|
||
|
||
# Convert a stream of CHAR into a stream of UNICHAR #
|
||
OP DOCONV = (GENCHAR gen char, YIELDUNICHAR yield)VOID: (
|
||
INT bytes to go := 0;
|
||
INT lshift;
|
||
BITS mask, out;
|
||
|
||
# FOR CHAR byte IN # gen char( # ) DO ( #
|
||
## (CHAR byte)VOID: (
|
||
INT bits := ABS byte;
|
||
IF bytes to go = 0 THEN # start new unichar #
|
||
bytes to go :=
|
||
IF bits <= ABS 2r01111111 THEN 1 # 7 bits #
|
||
ELIF bits <= ABS 2r11011111 THEN 2 # 11 bits #
|
||
ELIF bits <= ABS 2r11101111 THEN 3 # 16 bits #
|
||
ELIF bits <= ABS 2r11110111 THEN 4 # 21 bits #
|
||
ELIF bits <= ABS 2r11111011 THEN 5 # 26 bits #
|
||
ELIF bits <= ABS 2r11111101 THEN 6 # 31 bits #
|
||
ELSE raise value error(("Invalid UTF-8 bytes", BIN bits)); ~ FI;
|
||
IF bytes to go = 1 THEN
|
||
lshift := 7; mask := 2r1111111
|
||
ELSE
|
||
lshift := 7 - bytes to go; mask := BIN(ABS(2r1 SHL lshift)-1)
|
||
FI;
|
||
out := mask AND BIN bits;
|
||
|
||
lshift := 6; mask := 2r111111 # subsequently pic 6 bits at a time #
|
||
ELSE
|
||
out := (out SHL lshift) OR ( mask AND BIN bits)
|
||
FI;
|
||
bytes to go -:= 1;
|
||
IF bytes to go = 0 THEN yield(INITUNICHAR out) FI
|
||
# OD # ))
|
||
);
|
||
|
||
# Convert a string of UNICHAR into a stream of UTFCHAR #
|
||
OP DOCONV = (UNICODE unicode, YIELDCHAR yield)VOID:(
|
||
PROC gen unichar = (YIELDUNICHAR yield)VOID:
|
||
FOR i FROM LWB unicode TO UPB unicode DO yield(unicode[i]) OD;
|
||
gen unichar DOCONV yield
|
||
);
|
||
|
||
# Some convenience/shorthand U operators #
|
||
# Convert a BITS into a UNICODE char #
|
||
OP U = (BITS bits)UNICHAR:
|
||
INITUNICHAR bits;
|
||
|
||
# Convert a []BITS into a UNICODE char #
|
||
OP U = ([]BITS array bits)[]UNICHAR:(
|
||
[LWB array bits:UPB array bits]UNICHAR out;
|
||
FOR i FROM LWB array bits TO UPB array bits DO bits OF out[i]:=array bits[i] OD;
|
||
out
|
||
);
|
||
|
||
# Convert a CHAR into a UNICODE char #
|
||
OP U = (CHAR char)UNICHAR:
|
||
INITUNICHAR char;
|
||
|
||
# Convert a STRING into a UNICODE string #
|
||
OP U = (STRING utf8 string)UNICODE: (
|
||
FLEX[upb utf8(utf8 string)]UNICHAR out;
|
||
INT i := 0;
|
||
# FOR UNICHAR char IN # utf8 string DOCONV (
|
||
## (UNICHAR char)VOID:
|
||
out[i+:=1] := char
|
||
# OD #);
|
||
out
|
||
);
|
||
|
||
# Convert a UNICODE string into a UTF8 STRING #
|
||
OP REPR = (UNICODE string)STRING: (
|
||
STRING out;
|
||
# FOR CHAR char IN # string DOCONV (
|
||
## (CHAR char)VOID: (
|
||
out +:= char
|
||
# OD #));
|
||
out
|
||
);
|
||
|
||
# define the most useful OPerators on UNICODE CHARacter arrays #
|
||
# Note: LWB, UPB and slicing works as per normal #
|
||
|
||
OP + = (UNICODE a,b)UNICODE: (
|
||
[UPB a + UPB b]UNICHAR out;
|
||
out[:UPB a]:= a; out[UPB a+1:]:= b;
|
||
out
|
||
);
|
||
|
||
OP + = (UNICODE a, UNICHAR b)UNICODE: a+UNICODE(b);
|
||
OP + = (UNICHAR a, UNICODE b)UNICODE: UNICODE(a)+b;
|
||
OP + = (UNICHAR a,b)UNICODE: UNICODE(a)+b;
|
||
|
||
# Suffix a character to the end of a UNICODE string #
|
||
OP +:= = (REF UNICODE a, UNICODE b)VOID: a := a + b;
|
||
OP +:= = (REF UNICODE a, UNICHAR b)VOID: a := a + b;
|
||
|
||
# Prefix a character to the beginning of a UNICODE string #
|
||
OP +=: = (UNICODE b, REF UNICODE a)VOID: a := b + a;
|
||
OP +=: = (UNICHAR b, REF UNICODE a)VOID: a := b + a;
|
||
|
||
OP * = (UNICODE a, INT n)UNICODE: (
|
||
UNICODE out := a;
|
||
FOR i FROM 2 TO n DO out +:= a OD;
|
||
out
|
||
);
|
||
OP * = (INT n, UNICODE a)UNICODE: a * n;
|
||
|
||
OP * = (UNICHAR a, INT n)UNICODE: UNICODE(a)*n;
|
||
OP * = (INT n, UNICHAR a)UNICODE: n*UNICODE(a);
|
||
|
||
OP *:= = (REF UNICODE a, INT b)VOID: a := a * b;
|
||
|
||
# Wirthy Operators #
|
||
OP LT = (UNICHAR a,b)BOOL: ABS bits OF a LT ABS bits OF b,
|
||
LE = (UNICHAR a,b)BOOL: ABS bits OF a LE ABS bits OF b,
|
||
EQ = (UNICHAR a,b)BOOL: ABS bits OF a EQ ABS bits OF b,
|
||
NE = (UNICHAR a,b)BOOL: ABS bits OF a NE ABS bits OF b,
|
||
GE = (UNICHAR a,b)BOOL: ABS bits OF a GE ABS bits OF b,
|
||
GT = (UNICHAR a,b)BOOL: ABS bits OF a GT ABS bits OF b;
|
||
|
||
# ASCII OPerators #
|
||
OP < = (UNICHAR a,b)BOOL: a LT b,
|
||
<= = (UNICHAR a,b)BOOL: a LE b,
|
||
= = (UNICHAR a,b)BOOL: a EQ b,
|
||
/= = (UNICHAR a,b)BOOL: a NE b,
|
||
>= = (UNICHAR a,b)BOOL: a GE b,
|
||
> = (UNICHAR a,b)BOOL: a GT b;
|
||
|
||
# Non ASCII OPerators
|
||
OP ≤ = (UNICHAR a,b)BOOL: a LE b,
|
||
≠ = (UNICHAR a,b)BOOL: a NE b,
|
||
≥ = (UNICHAR a,b)BOOL: a GE b;
|
||
#
|
||
|
||
# Compare two UNICODE strings for equality #
|
||
PROC unicode cmp = (UNICODE str a,str b)INT: (
|
||
|
||
IF LWB str a > LWB str b THEN exit lt ELIF LWB str a < LWB str b THEN exit gt FI;
|
||
|
||
INT min upb = UPB(UPB str a < UPB str b | str a | str b );
|
||
|
||
FOR i FROM LWB str a TO min upb DO
|
||
UNICHAR a := str a[i], UNICHAR b := str b[i];
|
||
IF a < b THEN exit lt ELIF a > b THEN exit gt FI
|
||
OD;
|
||
|
||
IF UPB str a > UPB str b THEN exit gt ELIF UPB str a < UPB str b THEN exit lt FI;
|
||
|
||
exit eq: 0 EXIT
|
||
exit lt: -1 EXIT
|
||
exit gt: 1
|
||
);
|
||
|
||
OP LT = (UNICODE a,b)BOOL: unicode cmp(a,b)< 0,
|
||
LE = (UNICODE a,b)BOOL: unicode cmp(a,b)<=0,
|
||
EQ = (UNICODE a,b)BOOL: unicode cmp(a,b) =0,
|
||
NE = (UNICODE a,b)BOOL: unicode cmp(a,b)/=0,
|
||
GE = (UNICODE a,b)BOOL: unicode cmp(a,b)>=0,
|
||
GT = (UNICODE a,b)BOOL: unicode cmp(a,b)> 0;
|
||
|
||
# ASCII OPerators #
|
||
OP < = (UNICODE a,b)BOOL: a LT b,
|
||
<= = (UNICODE a,b)BOOL: a LE b,
|
||
= = (UNICODE a,b)BOOL: a EQ b,
|
||
/= = (UNICODE a,b)BOOL: a NE b,
|
||
>= = (UNICODE a,b)BOOL: a GE b,
|
||
> = (UNICODE a,b)BOOL: a GT b;
|
||
|
||
# Non ASCII OPerators
|
||
OP ≤ = (UNICODE a,b)BOOL: a LE b,
|
||
≠ = (UNICODE a,b)BOOL: a NE b,
|
||
≥ = (UNICODE a,b)BOOL: a GE b;
|
||
#
|
||
|
||
COMMENT - Todo: for all UNICODE and UNICHAR
|
||
Add NonASCII OPerators: ×, ×:=,
|
||
Add ASCII Operators: &, &:=, &=:
|
||
Add Wirthy OPerators: PLUSTO, PLUSAB, TIMESAB for UNICODE/UNICHAR,
|
||
Add UNICODE against UNICHAR comparison OPerators,
|
||
Add char_in_string and string_in_string PROCedures,
|
||
Add standard Unicode functions:
|
||
to_upper_case, to_lower_case, unicode_block, char_count,
|
||
get_directionality, get_numeric_value, get_type, is_defined,
|
||
is_digit, is_identifier_ignorable, is_iso_control,
|
||
is_letter, is_letter_or_digit, is_lower_case, is_mirrored,
|
||
is_space_char, is_supplementary_code_point, is_title_case,
|
||
is_unicode_identifier_part, is_unicode_identifier_start,
|
||
is_upper_case, is_valid_code_point, is_whitespace
|
||
END COMMENT
|
||
|
||
test:(
|
||
|
||
UNICHAR aircraft := U16r 2708;
|
||
printf(($"aircraft: "$, $"16r"16rdddd$, UNICODE(aircraft), $g$, " => ", REPR UNICODE(aircraft), $l$));
|
||
|
||
UNICODE chinese forty two = U16r 56db + U16r 5341 + U16r 4e8c;
|
||
printf(($"chinese forty two: "$, $g$, REPR chinese forty two, ", length string = ", UPB chinese forty two, $l$));
|
||
|
||
UNICODE poker = U "A123456789♥♦♣♠JQK";
|
||
printf(($"poker: "$, $g$, REPR poker, ", length string = ", UPB poker, $l$));
|
||
|
||
UNICODE selectric := U"×÷≤≥≠¬∨∧⏨→↓↑□⌊⌈⎩⎧○⊥¢";
|
||
printf(($"selectric: "$, $g$, REPR selectric, $l$));
|
||
printf(($"selectric*4: "$, $g$, REPR(selectric*4), $l$));
|
||
|
||
print((
|
||
"1 < 2 is ", U"1" < U"2", ", ",
|
||
"111 < 11 is ",U"111" < U"11", ", ",
|
||
"111 < 12 is ",U"111" < U"12", ", ",
|
||
"♥ < ♦ is ", U"♥" < U"♦", ", ",
|
||
"♥Q < ♥K is ",U"♥Q" < U"♥K", " & ",
|
||
"♥J < ♥K is ",U"♥J" < U"♥K", new line
|
||
))
|
||
|
||
)
|