"Unicode for Component Pascal identifiers" is mainly Helmut's solution.
We are just choosing between two implementations of the
Utf8ToString converter in
Kernel.
The version by Josef is based on Helmut's version:
Code: Select all
PROCEDURE Utf8ToString* (IN in : ARRAY OF SHORTCHAR; OUT out : ARRAY OF CHAR; OUT res: INTEGER);
  (* Decodes the 0X-terminated UTF-8 string in into the 16-bit string out.
     res = 0: ok.
     res = 1: out is too short, the result is truncated (and 0X-terminated).
     res = 2: format error; out then holds a character-by-character copy of in
              (as much of it as fits into out).
     Only 1-, 2- and 3-byte sequences are decoded; a lead byte >= 0F0X is a format error.
     Overlong encodings and surrogate code points are not rejected by this version. *)
  VAR i, j, val, max : INTEGER; ch : SHORTCHAR;

  PROCEDURE FormatError();
    VAR k : INTEGER;
  BEGIN
    (* Bounded equivalent of out := in$: copying more characters than out can hold
       would be an error at run time, so stop at max and terminate explicitly. *)
    k := 0;
    WHILE (k < max) & (in[k] # 0X) DO out[k] := in[k]; INC(k) END;
    out[k] := 0X; res := 2 (*format error*)
  END FormatError;

BEGIN
  ch := in[0]; i := 1; j := 0; max := LEN(out) - 1;
  WHILE (ch # 0X) & (j < max) DO
    IF ch < 80X THEN (* 1 byte: 00-7F *)
      out[j] := ch; INC(j)
    ELSIF ch < 0E0X THEN (* 2 bytes: lead C0-DF, one tail byte *)
      val := ORD(ch) - 192;
      (* val < 0 means ch is 80-BF, i.e. a tail byte without a lead byte *)
      IF val < 0 THEN FormatError; RETURN END;
      ch := in[i]; INC(i);
      (* a tail byte must be 80-BF; this also catches the terminating 0X *)
      IF (ch < 80X) OR (ch > 0BFX) THEN FormatError; RETURN END;
      val := val * 64 + ORD(ch) - 128;
      out[j] := CHR(val); INC(j)
    ELSIF ch < 0F0X THEN (* 3 bytes: lead E0-EF, two tail bytes *)
      val := ORD(ch) - 224;
      ch := in[i]; INC(i);
      IF (ch < 80X) OR (ch > 0BFX) THEN FormatError; RETURN END;
      val := val * 64 + ORD(ch) - 128;
      ch := in[i]; INC(i);
      IF (ch < 80X) OR (ch > 0BFX) THEN FormatError; RETURN END;
      val := val * 64 + ORD(ch) - 128;
      out[j] := CHR(val); INC(j)
    ELSE (* 4-byte sequences and invalid lead bytes F0-FF are not supported *)
      FormatError; RETURN
    END;
    ch := in[i]; INC(i)
  END;
  out[j] := 0X;
  IF ch = 0X THEN res := 0 (*ok*) ELSE res := 1 (*truncated*) END
END Utf8ToString;
The version by Luowy is based on Josef's version.
Code: Select all
  PROCEDURE Utf8ToString* (IN in: ARRAY OF SHORTCHAR; OUT out: ARRAY OF CHAR; OUT res: INTEGER);
    (* Decodes the 0X-terminated UTF-8 string in into the 16-bit string out.
       res = 0: ok.
       res = 1: out is too short, the result is truncated (and 0X-terminated).
       res = 2: ill-formed UTF-8 in the input; out is set to the empty string.
       Only 1-, 2- and 3-byte sequences are decoded; any lead byte >= 0F0X gives res = 2.
       Overlong encodings (lead C0/C1, E0 followed by 80-9F) and surrogates
       (ED followed by A0-BF) are rejected. *)
    VAR i, j, val, max: INTEGER; ch, ch0: SHORTCHAR;
  BEGIN
    ch := in[0]; i := 1; j := 0; max := LEN(out) - 1;
    WHILE (ch # 0X) & (j < max) DO
      IF ch < 80X THEN (* 1 byte: 00-7F *)
        out[j] := ch; INC(j)
      ELSIF ch < 0E0X THEN (* 2 bytes: C2-DF + tail *)
        val := ORD(ch) - 192;
        (* val < 0: stray tail byte 80-BF; val = 0 or 1: overlong lead C0/C1 *)
        IF val < 2 THEN out := ""; res := 2; RETURN END;
        ch := in[i]; INC(i);
        (* a tail byte must be 80-BF; this also catches the terminating 0X *)
        IF (ch < 80X) OR (ch > 0BFX) THEN out := ""; res := 2; RETURN END;
        val := val * 64 + ORD(ch) - 128;
        out[j] := CHR(val); INC(j)
      ELSIF ch < 0F0X THEN (* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx *)
        val := ORD(ch) - 224; ch0 := ch; ch := in[i]; INC(i);
        (* the allowed range of the second byte depends on the lead byte:
           E0 -> A0-BF (excludes overlong forms), ED -> 80-9F (excludes surrogates),
           any other lead -> 80-BF *)
        IF (ch0 = 0E0X) & (ch >= 0A0X) & (ch <= 0BFX)
          OR (ch0 = 0EDX) & (ch >= 80X) & (ch <= 9FX)
          OR (ch0 # 0E0X) & (ch0 # 0EDX) & (ch >= 80X) & (ch <= 0BFX)
        THEN
          val := val * 64 + ORD(ch) - 128
        ELSE
          out := ""; res := 2; RETURN
        END;
        ch := in[i]; INC(i);
        IF (ch < 80X) OR (ch > 0BFX) THEN out := ""; res := 2; RETURN END;
        val := val * 64 + ORD(ch) - 128;
        out[j] := CHR(val); INC(j)
      ELSE (* 4-byte sequences and invalid lead bytes F0-FF are not supported *)
        out := ""; res := 2; RETURN
      END;
      ch := in[i]; INC(i)
    END;
    out[j] := 0X;
    IF ch = 0X THEN res := 0 (*ok*) ELSE res := 1 (*truncated*) END
  END Utf8ToString;
The main difference is that Luowy's version checks the format of the input 
according to the Unicode 7.0 standard and returns 
res = 2 if it finds that the input contains ill-formed UTF-8. Also, Luowy's version works 
a bit faster and does not return a copy of the string (out := in$) when the conversion fails.
Neither version supports 4-byte UTF-8 sequences, which are used for musical symbols, rare Chinese characters and extinct writing systems; the range 00110000 - 001FFFFF is not used by Unicode.
One comment about "bad format" 3-byte UTF-8 sequences:
- The 1st illegal byte sequence is 
0E0X + 080X-09FX + 080X-0BFX.
This sequence can never be the result of the 
StringToUtf8 converter, because it is an overlong encoding of a code point below U+0800. If such a sequence appears in the input, there are security risks.
 - The 2nd illegal byte sequence is 
0EDX + 0A0X-0BFX + 080X-0BFX.
This sequence encodes the "private" (UCS-2) or surrogate (UTF-16) code points U+D800…U+DFFF, which should not be converted to 
String.
Actually, we do not know which Unicode encoding is assumed in BlackBox, but it should be UCS-2 or UTF-16.