1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
|
implement Btos;
# encoding details
# (Traditional) Shift-JIS
#
# 00..1f control characters
# 20 space
# 21..7f JIS X 0201:1976/1997 roman (see notes)
# 80 undefined
# 81..9f lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
# a0 undefined
# a1..df JIS X 0201:1976/1997 katakana
# e0..ea lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
# eb..ff undefined
#
# CP932 (windows-31J)
#
# this encoding scheme extends Shift-JIS in the following way
#
# eb..ec undefined (marked as lead bytes - see notes below)
# ed..ee lead byte of NEC-selected IBM extended characters
# ef undefined (marked as lead byte - see notes below)
# f0..f9 lead byte of User defined GAIJI (see note below)
# fa..fc lead byte of IBM extended characters
# fd..ff undefined
#
#
# Notes
#
# JIS X 0201:1976/1997 roman
# this is the same as ASCII but with 0x5c (ASCII code for '\')
# representing the Yen currency symbol '¥' (U+00a5)
# This mapping is contentious: some conversion packages implement it,
# others do not.
# The mapping files from The Unicode Consortium show cp932 mapping
# plain ascii in the range 00..7f whereas shift-jis maps 16r5c ('\') to the yen
# symbol (¥) and 16r7e ('~') to overline (¯)
#
# CP932 double-byte character codes:
#
# eb-ec, ef, f0-f9:
# Marked as DBCS LEAD BYTEs in the unicode mapping data
# obtained from:
# https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
#
# but there are no defined mappings for codes in this range.
# It is not clear whether or not an implementation should
# consume one or two bytes before emitting an error char.
#
include "sys.m";
include "convcs.m";
# handle to the Sys module; loaded by init()
sys : Sys;
# stands in for "no limit" when btos() is called with n == -1
MAXINT : con 16r7fffffff;
# Unicode replacement character, emitted for undefined or invalid codes
BADCHAR : con 16rFFFD;
# shape of the /lib/convcs/jisx0201kana map: KANAPAGES pages of KANAPAGESZ characters
KANAPAGES : con 1;
KANAPAGESZ : con 63;
# byte code that the first entry of the kana map is installed at in page0
KANACHAR0 : con 16ra1;
# shape of the /lib/convcs/cp932 map: CP932PAGES pages (one per lead byte)
# of CP932PAGESZ characters (one per trail byte)
CP932PAGES : con 45; # 81..84, 87..9f, e0..ea, ed..ee, fa..fc
CP932PAGESZ : con 189; # 40..fc (including 7f)
# trail byte value that corresponds to index 0 within a page
CP932CHAR0 : con 16r40;
# non-zero when init() was given "shiftjis"; selects the JIS X 0201 roman
# variants for 16r5c/16r7e and leaves the cp932 extension lead bytes unmapped
shiftjis := 0;
# single-byte lookup indexed by byte value: the character for that byte,
# BADCHAR if undefined, or -1 if the byte is a DBCS lead byte
page0 := array [256] of { * => BADCHAR };
# double-byte mapping table loaded by init(), indexed by (page * CP932PAGESZ) + trail index
cp932 : string;
# lead byte value -> page number within cp932; -1 for bytes that are not lead bytes
dbcsoff := array [256] of { * => -1 };
# Mark every byte in lo..hi as a DBCS lead byte in page0 and assign it the
# next consecutive page number (starting at pnum) in dbcsoff.
# Returns the first page number not yet assigned.
markleads(lo, hi, pnum : int) : int
{
	for (c := lo; c <= hi; c++) {
		page0[c] = -1;
		dbcsoff[c] = pnum++;
	}
	return pnum;
}

# Prepare the converter tables.
# arg selects the variant: "shiftjis" gives traditional Shift-JIS,
# anything else gives CP932.
# Returns nil on success, or an error string if a mapping file could
# not be loaded.
init(arg : string) : string
{
	sys = load Sys Sys->PATH;
	shiftjis = arg == "shiftjis";

	# load both mapping tables; give up on the first failure
	(err, kanamap) := getmap("/lib/convcs/jisx0201kana", KANAPAGESZ, KANAPAGES);
	if (err != nil)
		return err;
	(err, cp932) = getmap("/lib/convcs/cp932", CP932PAGESZ, CP932PAGES);
	if (err != nil)
		return err;

	# single-byte area: 00..7f is plain ASCII (as in cp932) ...
	for (a := 0; a <= 16r7f; a++)
		page0[a] = a;
	# ... and the jisx0201kana map occupies 16rA1..16rDF
	for (k := 0; k < KANAPAGESZ; k++)
		page0[KANACHAR0 + k] = kanamap[k];

	if (shiftjis) {
		# shift-jis uses JIS X 0201 roman rather than ASCII:
		# '\' becomes the yen sign and '~' becomes the overline
		page0[16r5c] = 16ra5;
		page0[16r7e] = 16raf;
	}

	# flag the DBCS lead bytes in page0 and record which page of the
	# mapping file each one selects
	next := markleads(16r81, 16r84, 0);
	next = markleads(16r87, 16r9f, next);
	next = markleads(16re0, 16rea, next);
	if (!shiftjis) {
		# cp932 extensions
		next = markleads(16red, 16ree, next);
		next = markleads(16rfa, 16rfc, next);
	}
	return nil;
}
# Convert Shift-JIS/CP932 bytes to a string.
# b is the input; n is the maximum number of characters to produce
# (-1 means no limit). No converter state is kept, so the state argument
# is ignored and nil is returned in its place.
# Returns (nil, converted string, number of bytes of b consumed).
# A lead byte at the very end of b with no trail byte is left unconsumed.
btos(nil : Convcs->State, b : array of byte, n : int) : (Convcs->State, string, int)
{
	if (n == -1)
		n = MAXINT;

	out := "";
	pos := 0;	# index of the next unconsumed byte; also the consumed count
	while (pos < len b && len out < n) {
		lead := int b[pos];
		c := page0[lead];
		if (c == -1) {
			# double-byte code: needs a trail byte
			if (pos + 1 >= len b)
				break;
			trail := (int b[pos + 1]) - CP932CHAR0;
			page := dbcsoff[lead];
			if (page != -1 && trail >= 0 && trail < CP932PAGESZ)
				c = cp932[(page * CP932PAGESZ) + trail];
			else
				c = BADCHAR;
			pos += 2;
		} else
			pos++;
		out[len out] = c;
	}
	return (nil, out, pos);
}
# Load a character mapping file.
# path is the file to read; it must decode to exactly pgsz * npgs
# characters.
# Returns (nil, map) on success or (error string, nil) on failure.
getmap(path : string, pgsz, npgs : int) : (string, string)
{
	fd := sys->open(path, Sys->OREAD);
	if (fd == nil)
		return (sys->sprint("%s: %r", path), nil);

	# worst case: every character takes UTFmax bytes
	buf := array[(pgsz * npgs) * Sys->UTFmax] of byte;
	nread := 0;
	while (nread < len buf) {
		n := sys->read(fd, buf[nread:], Sys->ATOMICIO);
		# report a read failure as such, rather than letting it
		# fall through to the misleading "bad data" message below
		if (n < 0)
			return (sys->sprint("%s: %r", path), nil);
		if (n == 0)
			break;
		nread += n;
	}
	map := string buf[:nread];
	if (len map != (pgsz * npgs))
		return (sys->sprint("%s: bad data", path), nil);
	return (nil, map);
}
|