Sorry, it has a bug; here is the fix. Ivan Denisov wrote: Luowy, your new version based on Josef's also fails to detect the badly formatted string: 0EDX 0A1X 8CX 0EDX 0BEX 0B4X 0X
Code: Select all
PROCEDURE Utf8ToString* (IN in: ARRAY OF SHORTCHAR; OUT out: ARRAY OF CHAR; OUT res: INTEGER);
	(* Decodes the 0X-terminated UTF-8 byte string in "in" into 16-bit characters in "out".
		res = 0: the whole input was decoded
		res = 1: "out" was too small; it holds the part that fits, 0X-terminated
		res = 2: "in" is not well-formed UTF-8 (or needs a 4-byte sequence); "out" is set to ""
		Accepted forms:
			00-7F
			C2-DF tail
			E0 A0-BF tail | E1-EC tail tail | ED 80-9F tail | EE-EF tail tail
		where tail is a byte in 80-BF. Overlong encodings and the surrogate range
		(ED A0-BF ..) are rejected. Lead bytes F0-FF are rejected because this procedure
		only produces one CHAR per decoded sequence. *)
	VAR i, j, val, max: INTEGER; ch, lead: SHORTCHAR;

	(* Reports a format error: clears the output and sets the result code. *)
	PROCEDURE Fail;
	BEGIN out := ""; res := 2
	END Fail;

	(* TRUE if c is a UTF-8 continuation byte (10xxxxxx). *)
	PROCEDURE IsTail (c: SHORTCHAR): BOOLEAN;
	BEGIN RETURN (c >= 80X) & (c <= 0BFX)
	END IsTail;

BEGIN
	ch := in[0]; i := 1; j := 0; max := LEN(out) - 1;	(* max: keep one slot for the final 0X *)
	WHILE (ch # 0X) & (j < max) DO
		IF ch < 80X THEN	(* 1 byte: 00-7F *)
			out[j] := ch; INC(j)
		ELSIF ch < 0E0X THEN	(* 2 bytes: C2-DF tail *)
			val := ORD(ch) - 192;
			(* val < 0: stray continuation byte 80-BF; val = 0 or 1: overlong lead C0/C1 *)
			IF val < 2 THEN Fail; RETURN END;
			ch := in[i]; INC(i);
			(* a 0X terminator here is not a tail byte, so a cut-off sequence is an error *)
			IF ~IsTail(ch) THEN Fail; RETURN END;
			val := val * 64 + ORD(ch) - 128;
			out[j] := CHR(val); INC(j)
		ELSIF ch < 0F0X THEN	(* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx *)
			lead := ch; val := ORD(lead) - 224;
			ch := in[i]; INC(i);
			(* second byte: E0 needs A0-BF (no overlong forms), ED needs 80-9F (no surrogates),
				every other lead byte takes any tail byte *)
			IF (lead = 0E0X) & (ch < 0A0X) OR (lead = 0EDX) & (ch > 9FX) OR ~IsTail(ch) THEN
				Fail; RETURN
			END;
			val := val * 64 + ORD(ch) - 128;
			ch := in[i]; INC(i);
			(* the third byte must be a real tail byte too; accepting C0-DF here could push
				val above 0FFFFH, which is out of range for CHR *)
			IF ~IsTail(ch) THEN Fail; RETURN END;
			val := val * 64 + ORD(ch) - 128;
			out[j] := CHR(val); INC(j)
		ELSE	(* 4-byte sequences and invalid lead bytes F0-FF *)
			Fail; RETURN
		END;
		ch := in[i]; INC(i)
	END;
	out[j] := 0X;
	IF ch = 0X THEN res := 0 (* ok *) ELSE res := 1 (* truncated *) END
END Utf8ToString;
I prefer this version. For now we do not need characters encoded with 4 bytes or more in identifiers, so this version is good enough for the kernel. DGDanforth wrote: Are we agreed that the choice is between luowy's (which version?) and Josef's?
The full version could be improved in Python style (several modes in one function) for use in a string library, but that is another subject.
luowy