This is my multi-variant module for UTF-8. I don't mind whether any procedure of this module is accepted into the kernel;
I hope you can check it quickly. In any case, it accepts 4-byte UTF-8, which may be useful for other things.
This is a draft; I rewrote it for this post just now.
We have spent too much time on this issue. I hope we can finish it as soon as possible and move forward.
Code: Select all
MODULE CpcMyUtf8;
(*
Accepts UTF-8 sequences of up to 4 bytes and can decode any legal UTF-8 byte sequence
to a UTF-16 character string, which may then be displayed correctly by the WinAPI "...W" procedures.
*)
CONST
Truncated = 1;
IllegalBytes = 2; IllegalChars = 2;
Surrogated = 4;
(*
Result flags; where several apply they are added together into res:
res = 0: ok
ODD(res): truncated string (bit 0, Truncated)
ODD(res DIV 2): illegal bytes / illegal string (bit 1, IllegalBytes or IllegalChars)
ODD(res DIV 4): contains a surrogate pair, i.e. not legal in an identifier (bit 2, Surrogated)
*)
CONST ReplaceChar = 0FFFDX; (* U+FFFD, written in place of undecodable input *)
(*
Full decode: converts the 0X-terminated UTF-8 byte string utf8 into the UTF-16 string str.
Every illegal byte, and every broken or unfinished multi-byte sequence, is decoded to one
ReplaceChar and decoding continues. str is always 0X-terminated.
res is the sum of the flags that apply: Truncated (str too small), IllegalBytes
(at least one ReplaceChar was produced), Surrogated (at least one surrogate pair was written).
A surrogate pair is never split: if only one free slot is left, the pair is dropped
and the result is flagged Truncated.
*)
PROCEDURE Utf8ToString* (IN utf8: ARRAY OF SHORTCHAR; OUT str: ARRAY OF CHAR; OUT res: INTEGER);
  VAR
    x, i, j, max: INTEGER;
    state: INTEGER; (* 32*e + 8*m + 4*s + t   e: error, m: range mode of first tail byte, s: needs surrogate pair, t: tail bytes left *)
    ch: SHORTCHAR;
    surrogated, truncated: BOOLEAN;
BEGIN
  res := 0; truncated := FALSE; surrogated := FALSE;
  max := LEN(str) - 1; j := 0; (* str[max] is reserved for the terminator *)
  ch := utf8[0]; i := 0; state := 0;
  WHILE (ch # 0X) & (j < max) & ~truncated DO
    IF state = 0 THEN (* expecting a lead byte *)
      CASE ch OF
      | 1X..7FX: (* 1 byte *)
        str[j] := ch; INC(j)
      | 0C2X..0DFX: (* 2 bytes *)
        x := ORD(ch) - 192; state := 1 (* e=0 m=0 s=0 t=1 *)
      | 0E0X..0EFX: (* 3 bytes *)
        x := ORD(ch) - 224;
        IF ch = 0E0X THEN state := 10 (* e=0 m=1 s=0 t=2: first tail must be A0..BF *)
        ELSIF ch = 0EDX THEN state := 18 (* e=0 m=2 s=0 t=2: first tail must be 80..9F *)
        ELSE state := 2 (* e=0 m=0 s=0 t=2 *)
        END
      | 0F0X..0F4X: (* 4 bytes *)
        x := ORD(ch) - 240;
        IF ch = 0F0X THEN state := 15 (* e=0 m=1 s=1 t=3: first tail must be 90..BF *)
        ELSIF ch = 0F4X THEN state := 23 (* e=0 m=2 s=1 t=3: first tail must be 80..8F *)
        ELSE state := 7 (* e=0 m=0 s=1 t=3 *)
        END
      ELSE (* single illegal byte *)
        str[j] := ReplaceChar; INC(j); res := IllegalBytes
      END
    ELSE (* expecting a tail byte; states 10, 18, 15, 23 only occur for the first tail byte *)
      CASE state OF
      | 10: IF (ch < 0A0X) OR (ch > 0BFX) THEN INC(state, 32) END
      | 18: IF (ch < 080X) OR (ch > 09FX) THEN INC(state, 32) END
      | 15: IF (ch < 090X) OR (ch > 0BFX) THEN INC(state, 32) END
      | 23: IF (ch < 080X) OR (ch > 08FX) THEN INC(state, 32) END
      ELSE
        IF (ch < 080X) OR (ch > 0BFX) THEN INC(state, 32) END
      END;
      IF state < 32 THEN (* legal tail byte *)
        DEC(state); x := x * 64 + ORD(ch) - 128;
        IF state MOD 4 = 0 THEN (* sequence complete *)
          IF ODD(state DIV 4) THEN (* code point above 0FFFFH: needs a surrogate pair *)
            IF j + 1 < max THEN
              surrogated := TRUE;
              DEC(x, 10000H);
              str[j] := CHR(0D800H + x DIV 400H); INC(j);
              str[j] := CHR(0DC00H + x MOD 400H); INC(j)
            ELSE (* no room for both halves: stop instead of leaving a lone high surrogate *)
              truncated := TRUE
            END
          ELSE
            str[j] := CHR(x); INC(j)
          END;
          state := 0
        END
      ELSE (* broken multi-byte sequence *)
        str[j] := ReplaceChar; INC(j); DEC(i); (* do not consume the offending byte; it is re-read as a lead byte *)
        res := IllegalBytes; state := 0
      END
    END;
    INC(i); ch := utf8[i]
  END;
  IF state # 0 THEN (* input ended inside a multi-byte sequence; j < max holds here *)
    str[j] := ReplaceChar; INC(j); res := IllegalBytes
  END;
  IF ch # 0X THEN truncated := TRUE END;
  str[j] := 0X; (* j <= max always holds at this point *)
  IF truncated THEN INC(res, Truncated) END;
  IF surrogated THEN INC(res, Surrogated) END
END Utf8ToString;
(*
Short decode: converts the 0X-terminated UTF-8 byte string utf8 into the UTF-16 string str
and stops at the first illegal byte or broken sequence.
res is a single code (not a sum): IllegalBytes if decoding stopped at illegal input or the
input ended inside a multi-byte sequence, otherwise Truncated if str was too small,
otherwise Surrogated if a surrogate pair was written, otherwise 0.
str is always 0X-terminated after the last completely decoded character, also when
res # 0; a surrogate pair is never split.
*)
PROCEDURE Utf8ToStringShort* (IN utf8: ARRAY OF SHORTCHAR; OUT str: ARRAY OF CHAR; OUT res: INTEGER);
  VAR
    x, i, j, max: INTEGER;
    state: INTEGER; (* 32*e + 8*m + 4*s + t   e: error, m: range mode of first tail byte, s: needs surrogate pair, t: tail bytes left *)
    ch: SHORTCHAR;
    surrogated, truncated: BOOLEAN;
BEGIN
  res := 0; truncated := FALSE; surrogated := FALSE;
  max := LEN(str) - 1; j := 0; (* str[max] is reserved for the terminator *)
  ch := utf8[0]; i := 0; state := 0;
  WHILE (ch # 0X) & (j < max) & ~truncated DO
    IF state = 0 THEN (* expecting a lead byte *)
      CASE ch OF
      | 1X..7FX: (* 1 byte *)
        str[j] := ch; INC(j)
      | 0C2X..0DFX: (* 2 bytes *)
        x := ORD(ch) - 192; state := 1 (* e=0 m=0 s=0 t=1 *)
      | 0E0X..0EFX: (* 3 bytes *)
        x := ORD(ch) - 224;
        IF ch = 0E0X THEN state := 10 (* e=0 m=1 s=0 t=2: first tail must be A0..BF *)
        ELSIF ch = 0EDX THEN state := 18 (* e=0 m=2 s=0 t=2: first tail must be 80..9F *)
        ELSE state := 2 (* e=0 m=0 s=0 t=2 *)
        END
      | 0F0X..0F4X: (* 4 bytes *)
        x := ORD(ch) - 240;
        IF ch = 0F0X THEN state := 15 (* e=0 m=1 s=1 t=3: first tail must be 90..BF *)
        ELSIF ch = 0F4X THEN state := 23 (* e=0 m=2 s=1 t=3: first tail must be 80..8F *)
        ELSE state := 7 (* e=0 m=0 s=1 t=3 *)
        END
      ELSE (* single illegal byte: stop, but hand back a terminated string *)
        str[j] := 0X; res := IllegalBytes; RETURN
      END
    ELSE (* expecting a tail byte; states 10, 18, 15, 23 only occur for the first tail byte *)
      CASE state OF
      | 10: IF (ch < 0A0X) OR (ch > 0BFX) THEN INC(state, 32) END
      | 18: IF (ch < 080X) OR (ch > 09FX) THEN INC(state, 32) END
      | 15: IF (ch < 090X) OR (ch > 0BFX) THEN INC(state, 32) END
      | 23: IF (ch < 080X) OR (ch > 08FX) THEN INC(state, 32) END
      ELSE
        IF (ch < 080X) OR (ch > 0BFX) THEN INC(state, 32) END
      END;
      IF state < 32 THEN (* legal tail byte *)
        DEC(state); x := x * 64 + ORD(ch) - 128;
        IF state MOD 4 = 0 THEN (* sequence complete *)
          IF ODD(state DIV 4) THEN (* code point above 0FFFFH: needs a surrogate pair *)
            IF j + 1 < max THEN
              surrogated := TRUE;
              DEC(x, 10000H);
              str[j] := CHR(0D800H + x DIV 400H); INC(j);
              str[j] := CHR(0DC00H + x MOD 400H); INC(j)
            ELSE (* no room for both halves: stop instead of leaving a lone high surrogate *)
              truncated := TRUE
            END
          ELSE
            str[j] := CHR(x); INC(j)
          END;
          state := 0
        END
      ELSE (* broken multi-byte sequence: stop, but hand back a terminated string *)
        str[j] := 0X; res := IllegalBytes; RETURN
      END
    END;
    INC(i); ch := utf8[i]
  END;
  str[j] := 0X; (* j <= max always holds at this point *)
  IF state # 0 THEN res := IllegalBytes (* input ended inside a multi-byte sequence *)
  ELSIF truncated OR (ch # 0X) THEN res := Truncated
  ELSIF surrogated THEN res := Surrogated
  END
END Utf8ToStringShort;
(*
Encodes the 0X-terminated UTF-16 string str as UTF-8 into utf8 (always 0X-terminated).
A valid surrogate pair becomes one 4-byte sequence; an unpaired surrogate is encoded as
ReplaceChar and res gets IllegalChars. If utf8 is too small, Truncated is added to res.
Illegal bytes that Utf8ToString replaced by ReplaceChar cannot be recovered by this procedure.
*)
PROCEDURE StringToUtf8* (IN str: ARRAY OF CHAR; OUT utf8: ARRAY OF SHORTCHAR; OUT res: INTEGER);
  VAR i, j, val, max: INTEGER;
BEGIN
  res := 0;
  i := 0; j := 0; max := LEN(utf8) - 4; (* one iteration writes up to 4 bytes, plus room for the terminator *)
  WHILE (str[i] # 0X) & (j < max) DO
    val := ORD(str[i]); INC(i);
    IF (val >= 0D800H) & (val < 0E000H) THEN (* surrogate code unit *)
      (* legal only as a high surrogate directly followed by a low surrogate *)
      IF (val >= 0DC00H) OR (str[i] < 0DC00X) OR (str[i] >= 0E000X) THEN
        val := ORD(ReplaceChar); res := IllegalChars
      END
    END;
    IF val < 128 THEN
      utf8[j] := SHORT(CHR(val)); INC(j)
    ELSIF val < 2048 THEN
      utf8[j] := SHORT(CHR(val DIV 64 + 192)); INC(j);
      utf8[j] := SHORT(CHR(val MOD 64 + 128)); INC(j)
    ELSIF (val >= 0D800H) & (val < 0DC00H) THEN (* high surrogate of a pair validated above *)
      val := (val - 0D800H) * 400H + (ORD(str[i]) - 0DC00H) + 10000H;
      utf8[j] := SHORT(CHR(0F0H + val DIV 40000H)); INC(j);
      utf8[j] := SHORT(CHR(80H + val DIV 4096 MOD 64)); INC(j);
      utf8[j] := SHORT(CHR(80H + val DIV 64 MOD 64)); INC(j);
      utf8[j] := SHORT(CHR(80H + val MOD 64)); INC(j);
      INC(i) (* the pair consumed two chars *)
    ELSE
      utf8[j] := SHORT(CHR(val DIV 4096 + 224)); INC(j);
      utf8[j] := SHORT(CHR(val DIV 64 MOD 64 + 128)); INC(j);
      utf8[j] := SHORT(CHR(val MOD 64 + 128)); INC(j)
    END
  END;
  utf8[j] := 0X;
  IF str[i] # 0X THEN INC(res, Truncated) END
END StringToUtf8;
(*
Full decode, PEP 383 style: converts the 0X-terminated UTF-8 byte string utf8 into the
UTF-16 string str. Every byte that is not part of a legal sequence is decoded to the
char 0DC00H + byte value, and decoding continues. str is always 0X-terminated.
res is the sum of the flags that apply: Truncated (str too small), IllegalBytes
(at least one byte was escaped), Surrogated (at least one surrogate pair was written).
A surrogate pair is never split: if only one free slot is left, the pair is dropped
and the result is flagged Truncated.
*)
PROCEDURE Utf8ToString2* (IN utf8: ARRAY OF SHORTCHAR; OUT str: ARRAY OF CHAR; OUT res: INTEGER);
  VAR
    x, i, j, max: INTEGER;
    state: INTEGER; (* 32*e + 8*m + 4*s + t   e: error, m: range mode of first tail byte, s: needs surrogate pair, t: tail bytes left *)
    ch: SHORTCHAR;
    surrogated, truncated: BOOLEAN;
    buf: ARRAY 3 OF SHORTCHAR; (* bytes of the sequence in progress: the lead byte and at most two tail bytes *)
    n: INTEGER; (* number of bytes in buf *)
BEGIN
  res := 0; truncated := FALSE; surrogated := FALSE;
  max := LEN(str) - 1; j := 0; (* str[max] is reserved for the terminator *)
  ch := utf8[0]; i := 0; state := 0;
  WHILE (ch # 0X) & (j < max) & ~truncated DO
    IF state = 0 THEN (* expecting a lead byte *)
      buf[0] := ch; n := 1;
      CASE ch OF
      | 1X..7FX: (* 1 byte *)
        str[j] := ch; INC(j)
      | 0C2X..0DFX: (* 2 bytes *)
        x := ORD(ch) - 192; state := 1 (* e=0 m=0 s=0 t=1 *)
      | 0E0X..0EFX: (* 3 bytes *)
        x := ORD(ch) - 224;
        IF ch = 0E0X THEN state := 10 (* e=0 m=1 s=0 t=2: first tail must be A0..BF *)
        ELSIF ch = 0EDX THEN state := 18 (* e=0 m=2 s=0 t=2: first tail must be 80..9F *)
        ELSE state := 2 (* e=0 m=0 s=0 t=2 *)
        END
      | 0F0X..0F4X: (* 4 bytes *)
        x := ORD(ch) - 240;
        IF ch = 0F0X THEN state := 15 (* e=0 m=1 s=1 t=3: first tail must be 90..BF *)
        ELSIF ch = 0F4X THEN state := 23 (* e=0 m=2 s=1 t=3: first tail must be 80..8F *)
        ELSE state := 7 (* e=0 m=0 s=1 t=3 *)
        END
      ELSE (* single illegal byte: escape it *)
        str[j] := CHR(0DC00H + ORD(ch)); INC(j); res := IllegalBytes
      END
    ELSE (* expecting a tail byte; states 10, 18, 15, 23 only occur for the first tail byte *)
      CASE state OF
      | 10: IF (ch < 0A0X) OR (ch > 0BFX) THEN INC(state, 32) END
      | 18: IF (ch < 080X) OR (ch > 09FX) THEN INC(state, 32) END
      | 15: IF (ch < 090X) OR (ch > 0BFX) THEN INC(state, 32) END
      | 23: IF (ch < 080X) OR (ch > 08FX) THEN INC(state, 32) END
      ELSE
        IF (ch < 080X) OR (ch > 0BFX) THEN INC(state, 32) END
      END;
      IF state < 32 THEN (* legal tail byte *)
        DEC(state); x := x * 64 + ORD(ch) - 128;
        IF state MOD 4 = 0 THEN (* sequence complete *)
          IF ODD(state DIV 4) THEN (* code point above 0FFFFH: needs a surrogate pair *)
            IF j + 1 < max THEN
              surrogated := TRUE;
              DEC(x, 10000H);
              str[j] := CHR(0D800H + x DIV 400H); INC(j);
              str[j] := CHR(0DC00H + x MOD 400H); INC(j)
            ELSE (* no room for both halves: stop instead of leaving a lone high surrogate *)
              truncated := TRUE
            END
          ELSE
            str[j] := CHR(x); INC(j)
          END;
          state := 0
        ELSE (* sequence continues: remember this byte in case the sequence breaks later *)
          buf[n] := ch; INC(n)
        END
      ELSE (* broken multi-byte sequence: escape every byte consumed so far *)
        res := IllegalBytes;
        FOR x := 0 TO n - 1 DO
          (* j may reach max + 1 here; that overshoot marks the truncation and is clamped below *)
          IF j <= max THEN str[j] := CHR(0DC00H + ORD(buf[x])); INC(j) END
        END;
        DEC(i); (* do not consume the offending byte; it is re-read as a lead byte *)
        state := 0
      END
    END;
    INC(i); ch := utf8[i]
  END;
  IF state # 0 THEN (* input ended inside a multi-byte sequence: escape its bytes *)
    res := IllegalBytes;
    FOR x := 0 TO n - 1 DO
      IF j <= max THEN str[j] := CHR(0DC00H + ORD(buf[x])); INC(j) END
    END
  END;
  IF (ch # 0X) OR (j > max) THEN truncated := TRUE END;
  IF j > max THEN j := max END; (* an escape landed in the terminator slot; it is overwritten *)
  str[j] := 0X;
  IF truncated THEN INC(res, Truncated) END;
  IF surrogated THEN INC(res, Surrogated) END
END Utf8ToString2;
(*
Short decode, companion of the PEP 383 decoder: converts the 0X-terminated UTF-8 byte
string utf8 into the UTF-16 string str and stops at the first illegal byte or broken
sequence (nothing is escaped).
res is a single code (not a sum): IllegalBytes if decoding stopped at illegal input or the
input ended inside a multi-byte sequence, otherwise Truncated if str was too small,
otherwise Surrogated if a surrogate pair was written, otherwise 0.
str is always 0X-terminated after the last completely decoded character, also when
res # 0; a surrogate pair is never split.
*)
PROCEDURE Utf8ToString2Short* (IN utf8: ARRAY OF SHORTCHAR; OUT str: ARRAY OF CHAR; OUT res: INTEGER);
  VAR
    x, i, j, max: INTEGER;
    state: INTEGER; (* 32*e + 8*m + 4*s + t   e: error, m: range mode of first tail byte, s: needs surrogate pair, t: tail bytes left *)
    ch: SHORTCHAR;
    surrogated, truncated: BOOLEAN;
BEGIN
  res := 0; truncated := FALSE; surrogated := FALSE;
  max := LEN(str) - 1; j := 0; (* str[max] is reserved for the terminator *)
  ch := utf8[0]; i := 0; state := 0;
  WHILE (ch # 0X) & (j < max) & ~truncated DO
    IF state = 0 THEN (* expecting a lead byte *)
      CASE ch OF
      | 1X..7FX: (* 1 byte *)
        str[j] := ch; INC(j)
      | 0C2X..0DFX: (* 2 bytes *)
        x := ORD(ch) - 192; state := 1 (* e=0 m=0 s=0 t=1 *)
      | 0E0X..0EFX: (* 3 bytes *)
        x := ORD(ch) - 224;
        IF ch = 0E0X THEN state := 10 (* e=0 m=1 s=0 t=2: first tail must be A0..BF *)
        ELSIF ch = 0EDX THEN state := 18 (* e=0 m=2 s=0 t=2: first tail must be 80..9F *)
        ELSE state := 2 (* e=0 m=0 s=0 t=2 *)
        END
      | 0F0X..0F4X: (* 4 bytes *)
        x := ORD(ch) - 240;
        IF ch = 0F0X THEN state := 15 (* e=0 m=1 s=1 t=3: first tail must be 90..BF *)
        ELSIF ch = 0F4X THEN state := 23 (* e=0 m=2 s=1 t=3: first tail must be 80..8F *)
        ELSE state := 7 (* e=0 m=0 s=1 t=3 *)
        END
      ELSE (* single illegal byte: stop, but hand back a terminated string *)
        str[j] := 0X; res := IllegalBytes; RETURN
      END
    ELSE (* expecting a tail byte; states 10, 18, 15, 23 only occur for the first tail byte *)
      CASE state OF
      | 10: IF (ch < 0A0X) OR (ch > 0BFX) THEN INC(state, 32) END
      | 18: IF (ch < 080X) OR (ch > 09FX) THEN INC(state, 32) END
      | 15: IF (ch < 090X) OR (ch > 0BFX) THEN INC(state, 32) END
      | 23: IF (ch < 080X) OR (ch > 08FX) THEN INC(state, 32) END
      ELSE
        IF (ch < 080X) OR (ch > 0BFX) THEN INC(state, 32) END
      END;
      IF state < 32 THEN (* legal tail byte *)
        DEC(state); x := x * 64 + ORD(ch) - 128;
        IF state MOD 4 = 0 THEN (* sequence complete *)
          IF ODD(state DIV 4) THEN (* code point above 0FFFFH: needs a surrogate pair *)
            IF j + 1 < max THEN
              surrogated := TRUE;
              DEC(x, 10000H);
              str[j] := CHR(0D800H + x DIV 400H); INC(j);
              str[j] := CHR(0DC00H + x MOD 400H); INC(j)
            ELSE (* no room for both halves: stop instead of leaving a lone high surrogate *)
              truncated := TRUE
            END
          ELSE
            str[j] := CHR(x); INC(j)
          END;
          state := 0
        END
      ELSE (* broken multi-byte sequence: stop, but hand back a terminated string *)
        str[j] := 0X; res := IllegalBytes; RETURN
      END
    END;
    INC(i); ch := utf8[i]
  END;
  str[j] := 0X; (* j <= max always holds at this point *)
  IF state # 0 THEN res := IllegalBytes (* input ended inside a multi-byte sequence *)
  ELSIF truncated OR (ch # 0X) THEN res := Truncated
  ELSIF surrogated THEN res := Surrogated
  END
END Utf8ToString2Short;
(*
Encodes the 0X-terminated UTF-16 string str as UTF-8 into utf8 (always 0X-terminated),
PEP 383 style: a char in 0DC80H..0DCFFH is written back as the single raw byte
(char MOD 100H), so illegal bytes escaped by Utf8ToString2 are recovered unchanged.
A high surrogate directly followed by a low surrogate becomes one 4-byte sequence.
Any other surrogate code unit is encoded like an ordinary 3-byte character.
res: 0 = ok; IllegalChars (local value 80000000H) if at least one escaped byte was
written; 1 is added if utf8 was too small to hold the whole string.
*)
PROCEDURE StringToUtf82* (IN str: ARRAY OF CHAR; OUT utf8: ARRAY OF SHORTCHAR; OUT res: INTEGER);
  CONST IllegalChars = 80000000H; Truncated = 1; (* local values; they hide the module-level constants *)
  VAR i, j, val, max: INTEGER;
BEGIN
  res := 0;
  i := 0; j := 0; max := LEN(utf8) - 4; (* one iteration writes up to 4 bytes, plus room for the terminator *)
  WHILE (str[i] # 0X) & (j < max) DO
    val := ORD(str[i]); INC(i);
    IF val < 128 THEN
      utf8[j] := SHORT(CHR(val)); INC(j)
    ELSIF val < 2048 THEN
      utf8[j] := SHORT(CHR(val DIV 64 + 192)); INC(j);
      utf8[j] := SHORT(CHR(val MOD 64 + 128)); INC(j)
    ELSIF (0DC80H <= val) & (val < 0DD00H) THEN (* PEP 383 escaped byte 80H..0FFH *)
      res := IllegalChars;
      utf8[j] := SHORT(CHR(val MOD 100H)); INC(j)
    ELSIF (0D800H <= val) & (val < 0DC00H) & (0DC00X <= str[i]) & (str[i] < 0E000X) THEN (* surrogate pair *)
      val := (val - 0D800H) * 400H + (ORD(str[i]) - 0DC00H) + 10000H;
      utf8[j] := SHORT(CHR(0F0H + val DIV 40000H)); INC(j);
      utf8[j] := SHORT(CHR(80H + val DIV 4096 MOD 64)); INC(j);
      utf8[j] := SHORT(CHR(80H + val DIV 64 MOD 64)); INC(j);
      utf8[j] := SHORT(CHR(80H + val MOD 64)); INC(j);
      INC(i) (* the pair consumed two chars *)
    ELSE
      utf8[j] := SHORT(CHR(val DIV 4096 + 224)); INC(j);
      utf8[j] := SHORT(CHR(val DIV 64 MOD 64 + 128)); INC(j);
      utf8[j] := SHORT(CHR(val MOD 64 + 128)); INC(j)
    END
  END;
  utf8[j] := 0X;
  IF str[i] # 0X THEN INC(res, Truncated) END
END StringToUtf82;
END CpcMyUtf8.
luowy