"Unicode for Component Pascal identifiers" is mainly Helmut's solution.
We are just choosing between two implementations of the
Utf8ToString converter in
Kernel.
The version by Josef is based on Helmut's version:
Code: Select all
PROCEDURE Utf8ToString* (IN in: ARRAY OF SHORTCHAR; OUT out: ARRAY OF CHAR; OUT res: INTEGER);
  (* Decodes the 0X-terminated UTF-8 byte string in into the character string out.
     res = 0: ok
     res = 1: out is too short, the result is truncated (and 0X-terminated)
     res = 2: format error; out receives an undecoded copy of in
     Only 1-, 2- and 3-byte sequences are decoded; a lead byte >= 0F0X is a format error. *)
  VAR i, j, val, max: INTEGER; ch: SHORTCHAR;

  PROCEDURE FormatError();
  (* NOTE(review): out := in$ presumably traps when out cannot hold in$ - confirm with callers *)
  BEGIN out := in$; res := 2 (*format error*)
  END FormatError;

BEGIN
  ch := in[0]; i := 1; j := 0; max := LEN(out) - 1; (* max: last index, reserved for the terminating 0X *)
  WHILE (ch # 0X) & (j < max) DO
    IF ch < 80X THEN (* 1 byte: 0xxxxxxx *)
      out[j] := ch; INC(j)
    ELSIF ch < 0E0X THEN (* 2 bytes: 110xxxxx 10xxxxxx *)
      val := ORD(ch) - 192;
      IF val < 0 THEN FormatError; RETURN END; (* 80X..0BFX: continuation byte in lead position *)
      ch := in[i]; INC(i);
      (* a continuation byte must be 10xxxxxx, i.e. 80X..0BFX; validate before using it *)
      IF (ch < 80X) OR (ch > 0BFX) THEN FormatError; RETURN END;
      val := val * 64 + ORD(ch) - 128;
      out[j] := CHR(val); INC(j)
    ELSIF ch < 0F0X THEN (* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx *)
      val := ORD(ch) - 224; (* never negative here, since ch >= 0E0X *)
      ch := in[i]; INC(i);
      IF (ch < 80X) OR (ch > 0BFX) THEN FormatError; RETURN END;
      val := val * 64 + ORD(ch) - 128;
      ch := in[i]; INC(i);
      IF (ch < 80X) OR (ch > 0BFX) THEN FormatError; RETURN END;
      val := val * 64 + ORD(ch) - 128;
      out[j] := CHR(val); INC(j)
    ELSE (* 4-byte sequences and invalid lead bytes are not supported *)
      FormatError; RETURN
    END;
    ch := in[i]; INC(i)
  END;
  out[j] := 0X;
  (* the loop ended either at the end of in, or because out is full *)
  IF ch = 0X THEN res := 0 (*ok*) ELSE res := 1 (*truncated*) END
END Utf8ToString;
The version by Luowy is based on Josef's version.
Code: Select all
PROCEDURE Utf8ToString* (IN in: ARRAY OF SHORTCHAR; OUT out: ARRAY OF CHAR; OUT res: INTEGER);
  (* Decodes the 0X-terminated UTF-8 byte string in into the character string out.
     res = 0: ok
     res = 1: out is too short, the result is truncated (and 0X-terminated)
     res = 2: ill-formed UTF-8; out is set to the empty string
     Only 1-, 2- and 3-byte sequences are decoded; a lead byte >= 0F0X is reported as ill-formed. *)
  VAR i, j, val, max: INTEGER; ch, lead: SHORTCHAR;

  PROCEDURE FormatError;
  BEGIN out := ""; res := 2
  END FormatError;

BEGIN
  ch := in[0]; i := 1; j := 0; max := LEN(out) - 1; (* max: last index, reserved for the terminating 0X *)
  WHILE (ch # 0X) & (j < max) DO
    IF ch < 80X THEN (* 1 byte: 00-7F *)
      out[j] := ch; INC(j)
    ELSIF ch < 0E0X THEN (* 2 bytes: C2-DF 80-BF *)
      val := ORD(ch) - 192;
      (* val < 0: continuation byte in lead position; val = 0 or 1: overlong lead bytes C0, C1 *)
      IF val < 2 THEN FormatError; RETURN END;
      ch := in[i]; INC(i);
      (* a continuation byte must be 10xxxxxx, i.e. 80X..0BFX *)
      IF (ch < 80X) OR (ch > 0BFX) THEN FormatError; RETURN END;
      val := val * 64 + ORD(ch) - 128;
      out[j] := CHR(val); INC(j)
    ELSIF ch < 0F0X THEN (* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx *)
      val := ORD(ch) - 224; lead := ch; ch := in[i]; INC(i);
      IF (ch < 80X) OR (ch > 0BFX) (* not a continuation byte *)
        OR (lead = 0E0X) & (ch < 0A0X) (* E0 80-9F: overlong encoding *)
        OR (lead = 0EDX) & (ch > 9FX) (* ED A0-BF: surrogate code points *)
      THEN FormatError; RETURN
      END;
      val := val * 64 + ORD(ch) - 128;
      ch := in[i]; INC(i);
      IF (ch < 80X) OR (ch > 0BFX) THEN FormatError; RETURN END;
      val := val * 64 + ORD(ch) - 128;
      out[j] := CHR(val); INC(j)
    ELSE (* 4 bytes: not supported *)
      FormatError; RETURN
    END;
    ch := in[i]; INC(i)
  END;
  out[j] := 0X;
  (* the loop ended either at the end of in, or because out is full *)
  IF ch = 0X THEN res := 0 (*ok*) ELSE res := 1 (*truncated*) END
END Utf8ToString;
The main difference is that Luowy's version checks the format of the input
according to the Unicode 7.0 standard and returns
res = 2 if it finds that the input contains ill-formed UTF-8. Also, Luowy's version works
a bit faster and does not return a copy of the input string (out := in$) when the conversion fails.
Neither version supports 4-byte UTF-8, which is used for musical symbols, rare Chinese characters, and extinct forms of writing; the range 00110000 - 001FFFFF is not used by Unicode.
One comment about "badly formed" 3-byte UTF-8:
- The 1st illegal byte sequence is
0E0X + 080X-09FX + 080X-0BFX.
This is an overlong encoding of a code point below U+0800, so it can never be the result of the
StringToUtf8 converter. If this sequence appears in the input, there are security risks.
- The 2nd illegal byte sequence is
0EDX + 0A0X-0BFX + 080X-0BFX.
This sequence encodes the "private" (UCS-2) or surrogate (UTF-16) characters U+D800…U+DFFF, which should not be converted to
String.
Actually, we do not know which Unicode encoding is assumed in BlackBox, but it should be UCS-2 or UTF-16.