RosettaCodeData/Task/Unicode-strings/ALGOL-68/unicode-strings.alg

302 lines
9.9 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/local/bin/a68g --script #
# -*- coding: utf-8 -*- #
# UNICHAR/UNICODE must be printed using REPR to convert to UTF8 #
MODE UNICHAR = STRUCT(BITS #31# bits); # assuming bits width >=31 #
MODE UNICODE = FLEX[0]UNICHAR;
OP INITUNICHAR = (BITS bits)UNICHAR: (UNICHAR out; bits OF out := #ABS# bits; out);
OP INITUNICHAR = (CHAR char)UNICHAR: (UNICHAR out; bits OF out := BIN ABS char; out);
OP INITBITS = (UNICHAR unichar)BITS: #BIN# bits OF unichar;
PROC raise value error = ([]UNION(FORMAT,BITS,STRING)argv )VOID: (
putf(stand error, argv); stop
);
MODE YIELDCHAR = PROC(CHAR)VOID; MODE GENCHAR = PROC(YIELDCHAR)VOID;
MODE YIELDUNICHAR = PROC(UNICHAR)VOID; MODE GENUNICHAR = PROC(YIELDUNICHAR)VOID;
PRIO DOCONV = 1;
# Convert a stream of UNICHAR into a stream of UTFCHAR #
OP DOCONV = (GENUNICHAR gen unichar, YIELDCHAR yield)VOID:(
BITS non ascii = NOT 2r1111111;
# FOR UNICHAR unichar IN # gen unichar( # ) DO ( #
## (UNICHAR unichar)VOID: (
BITS bits := INITBITS unichar;
IF (bits AND non ascii) = 2r0 THEN # ascii #
yield(REPR ABS bits)
ELSE
FLEX[6]CHAR buf := "?"*6; # initialise work around #
INT bytes := 0;
BITS byte lead bits = 2r10000000;
FOR ofs FROM UPB buf BY -1 WHILE
bytes +:= 1;
buf[ofs]:= REPR ABS (byte lead bits OR bits AND 2r111111);
bits := bits SHR 6;
# WHILE # bits NE 2r0 DO
SKIP
OD;
BITS first byte lead bits = BIN (ABS(2r1 SHL bytes)-2) SHL (UPB buf - bytes + 1);
buf := buf[UPB buf-bytes+1:];
buf[1] := REPR ABS(BIN ABS buf[1] OR first byte lead bits);
FOR i TO UPB buf DO yield(buf[i]) OD
FI
# OD # ))
);
# Convert a STRING into a stream of UNICHAR #
OP DOCONV = (STRING string, YIELDUNICHAR yield)VOID: (
PROC gen char = (YIELDCHAR yield)VOID:
FOR i FROM LWB string TO UPB string DO yield(string[i]) OD;
gen char DOCONV yield
);
CO Prosser/Thompson UTF8 encoding scheme
Bits Last code point Byte 1 Byte 2 Byte 3 Byte 4 Byte 5 Byte 6
7 U+007F 0xxxxxxx
11 U+07FF 110xxxxx 10xxxxxx
16 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
21 U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
26 U+3FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
31 U+7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
END CO
# Quickly calculate the length of the UTF8 encoded string #
PROC upb utf8 = (STRING utf8 string)INT:(
INT bytes to go := 0;
INT upb := 0;
FOR i FROM LWB utf8 string TO UPB utf8 string DO
CHAR byte := utf8 string[i];
IF bytes to go = 0 THEN # start new utf char #
bytes to go :=
IF ABS byte <= ABS 2r01111111 THEN 1 # 7 bits #
ELIF ABS byte <= ABS 2r11011111 THEN 2 # 11 bits #
ELIF ABS byte <= ABS 2r11101111 THEN 3 # 16 bits #
ELIF ABS byte <= ABS 2r11110111 THEN 4 # 21 bits #
ELIF ABS byte <= ABS 2r11111011 THEN 5 # 26 bits #
ELIF ABS byte <= ABS 2r11111101 THEN 6 # 31 bits #
ELSE raise value error(("Invalid UTF-8 bytes", BIN ABS byte)); ~ FI
FI;
bytes to go -:= 1; # skip over trailing bytes #
IF bytes to go = 0 THEN upb +:= 1 FI
OD;
upb
);
# Convert a stream of CHAR into a stream of UNICHAR #
OP DOCONV = (GENCHAR gen char, YIELDUNICHAR yield)VOID: (
INT bytes to go := 0;
INT lshift;
BITS mask, out;
# FOR CHAR byte IN # gen char( # ) DO ( #
## (CHAR byte)VOID: (
INT bits := ABS byte;
IF bytes to go = 0 THEN # start new unichar #
bytes to go :=
IF bits <= ABS 2r01111111 THEN 1 # 7 bits #
ELIF bits <= ABS 2r11011111 THEN 2 # 11 bits #
ELIF bits <= ABS 2r11101111 THEN 3 # 16 bits #
ELIF bits <= ABS 2r11110111 THEN 4 # 21 bits #
ELIF bits <= ABS 2r11111011 THEN 5 # 26 bits #
ELIF bits <= ABS 2r11111101 THEN 6 # 31 bits #
ELSE raise value error(("Invalid UTF-8 bytes", BIN bits)); ~ FI;
IF bytes to go = 1 THEN
lshift := 7; mask := 2r1111111
ELSE
lshift := 7 - bytes to go; mask := BIN(ABS(2r1 SHL lshift)-1)
FI;
out := mask AND BIN bits;
lshift := 6; mask := 2r111111 # subsequently pic 6 bits at a time #
ELSE
out := (out SHL lshift) OR ( mask AND BIN bits)
FI;
bytes to go -:= 1;
IF bytes to go = 0 THEN yield(INITUNICHAR out) FI
# OD # ))
);
# Convert a string of UNICHAR into a stream of UTFCHAR #
OP DOCONV = (UNICODE unicode, YIELDCHAR yield)VOID:(
PROC gen unichar = (YIELDUNICHAR yield)VOID:
FOR i FROM LWB unicode TO UPB unicode DO yield(unicode[i]) OD;
gen unichar DOCONV yield
);
# Some convenience/shorthand U operators #
# Convert a BITS into a UNICODE char #
OP U = (BITS bits)UNICHAR:
INITUNICHAR bits;
# Convert a []BITS into a UNICODE char #
OP U = ([]BITS array bits)[]UNICHAR:(
[LWB array bits:UPB array bits]UNICHAR out;
FOR i FROM LWB array bits TO UPB array bits DO bits OF out[i]:=array bits[i] OD;
out
);
# Convert a CHAR into a UNICODE char #
OP U = (CHAR char)UNICHAR:
INITUNICHAR char;
# Convert a STRING into a UNICODE string #
OP U = (STRING utf8 string)UNICODE: (
FLEX[upb utf8(utf8 string)]UNICHAR out;
INT i := 0;
# FOR UNICHAR char IN # utf8 string DOCONV (
## (UNICHAR char)VOID:
out[i+:=1] := char
# OD #);
out
);
# Convert a UNICODE string into a UTF8 STRING #
OP REPR = (UNICODE string)STRING: (
STRING out;
# FOR CHAR char IN # string DOCONV (
## (CHAR char)VOID: (
out +:= char
# OD #));
out
);
# define the most useful OPerators on UNICODE CHARacter arrays #
# Note: LWB, UPB and slicing works as per normal #
OP + = (UNICODE a,b)UNICODE: (
[UPB a + UPB b]UNICHAR out;
out[:UPB a]:= a; out[UPB a+1:]:= b;
out
);
OP + = (UNICODE a, UNICHAR b)UNICODE: a+UNICODE(b);
OP + = (UNICHAR a, UNICODE b)UNICODE: UNICODE(a)+b;
OP + = (UNICHAR a,b)UNICODE: UNICODE(a)+b;
# Suffix a character to the end of a UNICODE string #
OP +:= = (REF UNICODE a, UNICODE b)VOID: a := a + b;
OP +:= = (REF UNICODE a, UNICHAR b)VOID: a := a + b;
# Prefix a character to the beginning of a UNICODE string #
OP +=: = (UNICODE b, REF UNICODE a)VOID: a := b + a;
OP +=: = (UNICHAR b, REF UNICODE a)VOID: a := b + a;
OP * = (UNICODE a, INT n)UNICODE: (
UNICODE out := a;
FOR i FROM 2 TO n DO out +:= a OD;
out
);
OP * = (INT n, UNICODE a)UNICODE: a * n;
OP * = (UNICHAR a, INT n)UNICODE: UNICODE(a)*n;
OP * = (INT n, UNICHAR a)UNICODE: n*UNICODE(a);
OP *:= = (REF UNICODE a, INT b)VOID: a := a * b;
# Wirthy Operators #
OP LT = (UNICHAR a,b)BOOL: ABS bits OF a LT ABS bits OF b,
LE = (UNICHAR a,b)BOOL: ABS bits OF a LE ABS bits OF b,
EQ = (UNICHAR a,b)BOOL: ABS bits OF a EQ ABS bits OF b,
NE = (UNICHAR a,b)BOOL: ABS bits OF a NE ABS bits OF b,
GE = (UNICHAR a,b)BOOL: ABS bits OF a GE ABS bits OF b,
GT = (UNICHAR a,b)BOOL: ABS bits OF a GT ABS bits OF b;
# ASCII OPerators #
OP < = (UNICHAR a,b)BOOL: a LT b,
<= = (UNICHAR a,b)BOOL: a LE b,
= = (UNICHAR a,b)BOOL: a EQ b,
/= = (UNICHAR a,b)BOOL: a NE b,
>= = (UNICHAR a,b)BOOL: a GE b,
> = (UNICHAR a,b)BOOL: a GT b;
# Non ASCII OPerators
OP ≤ = (UNICHAR a,b)BOOL: a LE b,
≠ = (UNICHAR a,b)BOOL: a NE b,
≥ = (UNICHAR a,b)BOOL: a GE b;
#
# Compare two UNICODE strings for equality #
PROC unicode cmp = (UNICODE str a,str b)INT: (
IF LWB str a > LWB str b THEN exit lt ELIF LWB str a < LWB str b THEN exit gt FI;
INT min upb = UPB(UPB str a < UPB str b | str a | str b );
FOR i FROM LWB str a TO min upb DO
UNICHAR a := str a[i], UNICHAR b := str b[i];
IF a < b THEN exit lt ELIF a > b THEN exit gt FI
OD;
IF UPB str a > UPB str b THEN exit gt ELIF UPB str a < UPB str b THEN exit lt FI;
exit eq: 0 EXIT
exit lt: -1 EXIT
exit gt: 1
);
OP LT = (UNICODE a,b)BOOL: unicode cmp(a,b)< 0,
LE = (UNICODE a,b)BOOL: unicode cmp(a,b)<=0,
EQ = (UNICODE a,b)BOOL: unicode cmp(a,b) =0,
NE = (UNICODE a,b)BOOL: unicode cmp(a,b)/=0,
GE = (UNICODE a,b)BOOL: unicode cmp(a,b)>=0,
GT = (UNICODE a,b)BOOL: unicode cmp(a,b)> 0;
# ASCII OPerators #
OP < = (UNICODE a,b)BOOL: a LT b,
<= = (UNICODE a,b)BOOL: a LE b,
= = (UNICODE a,b)BOOL: a EQ b,
/= = (UNICODE a,b)BOOL: a NE b,
>= = (UNICODE a,b)BOOL: a GE b,
> = (UNICODE a,b)BOOL: a GT b;
# Non ASCII OPerators
OP ≤ = (UNICODE a,b)BOOL: a LE b,
≠ = (UNICODE a,b)BOOL: a NE b,
≥ = (UNICODE a,b)BOOL: a GE b;
#
COMMENT - Todo: for all UNICODE and UNICHAR
Add NonASCII OPerators: ×, ×:=,
Add ASCII Operators: &, &:=, &=:
Add Wirthy OPerators: PLUSTO, PLUSAB, TIMESAB for UNICODE/UNICHAR,
Add UNICODE against UNICHAR comparison OPerators,
Add char_in_string and string_in_string PROCedures,
Add standard Unicode functions:
to_upper_case, to_lower_case, unicode_block, char_count,
get_directionality, get_numeric_value, get_type, is_defined,
is_digit, is_identifier_ignorable, is_iso_control,
is_letter, is_letter_or_digit, is_lower_case, is_mirrored,
is_space_char, is_supplementary_code_point, is_title_case,
is_unicode_identifier_part, is_unicode_identifier_start,
is_upper_case, is_valid_code_point, is_whitespace
END COMMENT
test:(
UNICHAR aircraft := U16r 2708;
printf(($"aircraft: "$, $"16r"16rdddd$, UNICODE(aircraft), $g$, " => ", REPR UNICODE(aircraft), $l$));
UNICODE chinese forty two = U16r 56db + U16r 5341 + U16r 4e8c;
printf(($"chinese forty two: "$, $g$, REPR chinese forty two, ", length string = ", UPB chinese forty two, $l$));
UNICODE poker = U "A123456789♥♦♣♠JQK";
printf(($"poker: "$, $g$, REPR poker, ", length string = ", UPB poker, $l$));
UNICODE selectric := U"×÷≤≥≠¬∨∧⏨→↓↑□⌊⌈⎩⎧○⊥¢";
printf(($"selectric: "$, $g$, REPR selectric, $l$));
printf(($"selectric*4: "$, $g$, REPR(selectric*4), $l$));
print((
"1 < 2 is ", U"1" < U"2", ", ",
"111 < 11 is ",U"111" < U"11", ", ",
"111 < 12 is ",U"111" < U"12", ", ",
"♥ < ♦ is ", U"♥" < U"♦", ", ",
"♥Q < ♥K is ",U"♥Q" < U"♥K", " & ",
"♥J < ♥K is ",U"♥J" < U"♥K", new line
))
)