diff options
| author | Charles Forsyth <charles.forsyth@gmail.com> | 2013-06-06 21:01:19 +0000 |
|---|---|---|
| committer | Charles Forsyth <charles.forsyth@gmail.com> | 2013-06-06 21:01:19 +0000 |
| commit | 7ded4a527bdfd0e8b3a9049955f2af89e5f039ee (patch) | |
| tree | 7ee0f03dd4e5cad072bc0c816468ebfd0dd17c67 /lib9/rune.c | |
| parent | 8bdf904b6a23b93336ae2837772110bcdad234ce (diff) | |
sync with Plan 9
Diffstat (limited to 'lib9/rune.c')
| -rw-r--r-- | lib9/rune.c | 205 |
1 files changed, 104 insertions, 101 deletions
diff --git a/lib9/rune.c b/lib9/rune.c index 3dcff0b4..f969a2e6 100644 --- a/lib9/rune.c +++ b/lib9/rune.c @@ -1,75 +1,66 @@ #include "lib9.h" +#define Bit(i) (7-(i)) +/* N 0's preceded by i 1's, T(Bit(2)) is 1100 0000 */ +#define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF) +/* 0000 0000 0000 0111 1111 1111 */ +#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1) + enum { - Bit1 = 7, - Bitx = 6, - Bit2 = 5, - Bit3 = 4, - Bit4 = 3, - - T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ - Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ - T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ - T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ - T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ - - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ - - Maskx = (1<<Bitx)-1, /* 0011 1111 */ - Testx = Maskx ^ 0xFF, /* 1100 0000 */ - - Bad = Runeerror + Bitx = Bit(1), + + Tx = T(1), /* 1000 0000 */ + Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + + Maskx = (1<<Bitx)-1, /* 0011 1111 */ + Testx = Maskx ^ 0xFF, /* 1100 0000 */ + + SurrogateMin = 0xD800, + SurrogateMax = 0xDFFF, + + Bad = Runeerror, }; int chartorune(Rune *rune, char *str) { - int c, c1, c2; - long l; + int c[UTFmax], i; + Rune l; /* - * one character sequence + * N character sequence * 00000-0007F => T1 + * 00080-007FF => T2 Tx + * 00800-0FFFF => T3 Tx Tx + * 10000-10FFFF => T4 Tx Tx Tx */ - c = *(uchar*)str; - if(c < Tx) { - *rune = c; + + c[0] = *(uchar*)(str); + if(c[0] < Tx){ + *rune = c[0]; return 1; } + l = c[0]; - /* - * two character sequence - * 0080-07FF => T2 Tx - */ - c1 = *(uchar*)(str+1) ^ Tx; - if(c1 & Testx) - goto bad; - if(c < T3) { - if(c < T2) - goto bad; - l = ((c << Bitx) | c1) & Rune2; - if(l <= Rune1) + for(i = 1; i < UTFmax; i++) { + c[i] = *(uchar*)(str+i); + c[i] ^= Tx; + if(c[i] & Testx) goto bad; - *rune = l; - return 2; - } - - /* - * three character sequence - * 0800-FFFF => T3 Tx Tx - */ - c2 = *(uchar*)(str+2) ^ Tx; - if(c2 & Testx) - goto bad; - if(c < T4) { - l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; - if(l <= Rune2) - goto bad; - *rune = l; - return 3; + l = (l << Bitx) | c[i]; + if(c[0] < T(i + 2)) { + l &= RuneX(i + 1); + if(i == 1) { + if(c[0] < T(2) || l <= Rune1) + goto bad; + } else if(l <= RuneX(i) || l > Runemax) + goto bad; + if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) + goto bad; + *rune = l; + return i + 1; + } } /* @@ -83,12 +74,9 @@ bad: int runetochar(char *str, Rune *rune) { - long c; + int i, j; + Rune c; - /* - * one character sequence - * 00000-0007F => 00-7F - */ c = *rune; if(c <= Rune1) { str[0] = c; @@ -96,67 +84,82 @@ runetochar(char *str, Rune *rune) } /* + * one character sequence + * 00000-0007F => 00-7F * two character sequence * 0080-07FF => T2 Tx - */ - if(c <= Rune2) { - str[0] = T2 | (c >> 1*Bitx); - str[1] = Tx | (c & Maskx); - return 2; - } - - /* * three character sequence * 0800-FFFF => T3 Tx Tx + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + * If the Rune is out of range or a surrogate half, + * convert it to the error rune. + * Do this test when i==3 because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + for(i = 2; i < UTFmax + 1; i++){ + if(i == 3){ + if(c > Runemax) + c = Runeerror; + if(SurrogateMin <= c && c <= SurrogateMax) + c = Runeerror; + } + if (c <= RuneX(i) || i == UTFmax ) { + str[0] = T(i) | (c >> (i - 1)*Bitx); + for(j = 1; j < i; j++) + str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); + return i; + } + } + return UTFmax; } int runelen(long c) { - if(c <= Rune1) - return 1; - if(c <= Rune2) - return 2; - return 3; + Rune rune; + char str[10]; + + rune = c; + return runetochar(str, &rune); } int -runenlen(Rune *r, int l) +runenlen(Rune *r, int nrune) { - int n; - long c; + int nb, i; + Rune c; - n = 0; - while(l--) { + nb = 0; + while(nrune--) { c = *r++; - if(c <= Rune1) - n += 1; - else - if(c <= Rune2) - n += 2; - else - n += 3; + if(c <= Rune1){ + nb++; + } else { + for(i = 2; i < UTFmax + 1; i++) + if(c <= RuneX(i) || i == UTFmax){ + nb += i; + break; + } + } } - return n; + return nb; } int fullrune(char *str, int n) { - int c; - - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + int i; + Rune c; + + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + for(i = 3; i < UTFmax + 1; i++) + if(c < T(i)) + return n >= i - 1; + return n >= UTFmax; } |
