From 3cd4f1d15146c08f05206d6328ecbc1c7fdc8dfa Mon Sep 17 00:00:00 2001 From: Charles Forsyth Date: Mon, 30 Jun 2014 10:57:08 +0100 Subject: update to match lib9 --- libkern/rune.c | 195 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 99 insertions(+), 96 deletions(-) (limited to 'libkern/rune.c') diff --git a/libkern/rune.c b/libkern/rune.c index d3cb28e0..f969a2e6 100644 --- a/libkern/rune.c +++ b/libkern/rune.c @@ -1,25 +1,23 @@ #include "lib9.h" +#define Bit(i) (7-(i)) +/* N 0's preceded by i 1's, T(Bit(2)) is 1100 0000 */ +#define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF) +/* 0000 0000 0000 0111 1111 1111 */ +#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1) + enum { - Bit1 = 7, - Bitx = 6, - Bit2 = 5, - Bit3 = 4, - Bit4 = 3, + Bitx = Bit(1), - T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ - Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ - T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ - T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ - T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + Tx = T(1), /* 1000 0000 */ + Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Maskx = (1< T1 + * 00080-007FF => T2 Tx + * 00800-0FFFF => T3 Tx Tx + * 10000-10FFFF => T4 Tx Tx Tx */ - c = *(uchar*)str; - if(c < Tx) { - *rune = c; - return 1; - } - /* - * two character sequence - * 0080-07FF => T2 Tx - */ - c1 = *(uchar*)(str+1) ^ Tx; - if(c1 & Testx) - goto bad; - if(c < T3) { - if(c < T2) - goto bad; - l = ((c << Bitx) | c1) & Rune2; - if(l <= Rune1) - goto bad; - *rune = l; - return 2; + c[0] = *(uchar*)(str); + if(c[0] < Tx){ + *rune = c[0]; + return 1; } + l = c[0]; - /* - * three character sequence - * 0800-FFFF => T3 Tx Tx - */ - c2 = *(uchar*)(str+2) ^ Tx; - if(c2 & Testx) - goto bad; - if(c < T4) { - l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; - if(l <= Rune2) + for(i = 1; i < UTFmax; i++) { + c[i] = *(uchar*)(str+i); + c[i] ^= Tx; + if(c[i] & Testx) goto bad; - *rune = l; - return 3; + l = (l << Bitx) | c[i]; + if(c[0] < T(i + 2)) { + l &= RuneX(i + 1); + if(i == 1) { + if(c[0] < T(2) || l <= Rune1) + goto bad; + } else if(l <= RuneX(i) || l > Runemax) + goto bad; + if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) + goto bad; + *rune = l; + return i + 1; + } } /* @@ -83,12 +74,9 @@ bad: int runetochar(char *str, Rune *rune) { - long c; + int i, j; + Rune c; - /* - * one character sequence - * 00000-0007F => 00-7F - */ c = *rune; if(c <= Rune1) { str[0] = c; @@ -96,67 +84,82 @@ runetochar(char *str, Rune *rune) } /* + * one character sequence + * 00000-0007F => 00-7F * two character sequence * 0080-07FF => T2 Tx - */ - if(c <= Rune2) { - str[0] = T2 | (c >> 1*Bitx); - str[1] = Tx | (c & Maskx); - return 2; - } - - /* * three character sequence * 0800-FFFF => T3 Tx Tx + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + * If the Rune is out of range or a surrogate half, + * convert it to the error rune. + * Do this test when i==3 because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + for(i = 2; i < UTFmax + 1; i++){ + if(i == 3){ + if(c > Runemax) + c = Runeerror; + if(SurrogateMin <= c && c <= SurrogateMax) + c = Runeerror; + } + if (c <= RuneX(i) || i == UTFmax ) { + str[0] = T(i) | (c >> (i - 1)*Bitx); + for(j = 1; j < i; j++) + str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); + return i; + } + } + return UTFmax; } int runelen(long c) { - if(c <= Rune1) - return 1; - if(c <= Rune2) - return 2; - return 3; + Rune rune; + char str[10]; + + rune = c; + return runetochar(str, &rune); } int -runenlen(Rune *r, int l) +runenlen(Rune *r, int nrune) { - int n; - long c; + int nb, i; + Rune c; - n = 0; - while(l--) { + nb = 0; + while(nrune--) { c = *r++; - if(c <= Rune1) - n += 1; - else - if(c <= Rune2) - n += 2; - else - n += 3; + if(c <= Rune1){ + nb++; + } else { + for(i = 2; i < UTFmax + 1; i++) + if(c <= RuneX(i) || i == UTFmax){ + nb += i; + break; + } + } } - return n; + return nb; } int fullrune(char *str, int n) { - int c; - - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + int i; + Rune c; + + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + for(i = 3; i < UTFmax + 1; i++) + if(c < T(i)) + return n >= i - 1; + return n >= UTFmax; } -- cgit v1.2.3