summaryrefslogtreecommitdiff
path: root/appl/lib/convcs/cp932_btos.b
diff options
context:
space:
mode:
Diffstat (limited to 'appl/lib/convcs/cp932_btos.b')
-rw-r--r--appl/lib/convcs/cp932_btos.b179
1 files changed, 179 insertions, 0 deletions
diff --git a/appl/lib/convcs/cp932_btos.b b/appl/lib/convcs/cp932_btos.b
new file mode 100644
index 00000000..a99d87f5
--- /dev/null
+++ b/appl/lib/convcs/cp932_btos.b
@@ -0,0 +1,179 @@
+implement Btos;
+
+# encoding details
+# (Traditional) Shift-JIS
+#
+# 00..1f control characters
+# 20 space
+# 21..7f JIS X 0201:1976/1997 roman (see notes)
+# 80 undefined
+# 81..9f lead byte of JIS X 0208-1983 or JIS X 0208:1990/1997
+# a0 undefined
+# a1..df JIS X 0201:1976/1997 katakana
+# e0..ea lead byte of JIS X 0208-1983 or JIS X 0208:1990/1997
+# eb..ff undefined
+#
+# CP932 (windows-31J)
+#
+# this encoding scheme extends Shift-JIS in the following way
+#
+# eb..ec undefined (marked as lead bytes - see notes below)
+# ed..ee lead byte of NEC-selected IBM extended characters
+# ef undefined (marked as lead byte - see notes below)
+# f0..f9 lead byte of User defined GAIJI (see note below)
+# fa..fc lead byte of IBM extended characters
+# fd..ff undefined
+#
+#
+# Notes
+#
+# JISX 0201:1976/1997 roman
+# this is the same as ASCII but with 0x5c (ASCII code for '\')
+# representing the Yen currency symbol '¥' (U+00a5)
+# This mapping is contentious: some conversion packages implement it,
+# others do not.
+# The mapping files from The Unicode Consortium show cp932 mapping
+# plain ascii in the range 00..7f whereas shift-jis maps 16r5c ('\') to the yen
+# symbol (¥) and 16r7e ('~') to overline (¯)
+#
+# CP932 double-byte character codes:
+#
+# eb-ec, ef, f0-f9:
+# Marked as DBCS LEAD BYTEs in the unicode mapping data
+# obtained from:
+# https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
+#
+# but there are no defined mappings for codes in this range.
+# It is not clear whether or not an implementation should
+# consume one or two bytes before emitting an error char.
+#
+
+include "sys.m";
+include "convcs.m";
+
+# handle to the Sys module; loaded in init
+sys : Sys;
+
+# MAXINT stands in for "no limit" when btos is called with n == -1
+MAXINT : con 16r7fffffff;
+# BADCHAR (U+FFFD) is emitted for undefined bytes and invalid double-byte codes
+BADCHAR : con 16rFFFD;
+
+# dimensions of the /lib/convcs/jisx0201kana map, and the first byte value it covers
+KANAPAGES : con 1;
+KANAPAGESZ : con 63;
+KANACHAR0 : con 16ra1;
+
+# dimensions of the /lib/convcs/cp932 map; CP932CHAR0 is the trail byte
+# value that corresponds to index 0 within a page
+CP932PAGES : con 45; # 81..84, 87..9f, e0..ea, ed..ee, fa..fc
+CP932PAGESZ : con 189; # 40..fc (including 7f)
+CP932CHAR0 : con 16r40;
+
+
+# non-zero when init was given the argument "shiftjis"
+shiftjis := 0;
+# per-byte table: the Unicode value of a single-byte code, BADCHAR if the
+# byte is undefined, or -1 if the byte is a DBCS lead byte
+page0 := array [256] of { * => BADCHAR };
+# double-byte map loaded by init; CP932PAGESZ characters per page
+cp932 : string;
+# lead byte -> page number within cp932, or -1 if the byte is not a lead byte
+dbcsoff := array [256] of { * => -1 };
+
+# init loads the mapping files and builds the byte lookup tables used by btos.
+#
+# arg selects the variant: "shiftjis" gives traditional Shift-JIS (JIS X 0201
+# roman in 00..7f, no vendor extension lead bytes); any other value gives CP932.
+# Returns nil on success, or the error string produced by getmap.
+#
+# The tables are rebuilt from scratch on every call, so calling init again
+# with a different arg does not leave lead-byte marks from the earlier variant.
+init(arg : string) : string
+{
+	sys = load Sys Sys->PATH;
+	shiftjis = arg == "shiftjis";
+
+	(error, kana) := getmap("/lib/convcs/jisx0201kana", KANAPAGESZ, KANAPAGES);
+	if (error != nil)
+		return error;
+
+	(error, cp932) = getmap("/lib/convcs/cp932", CP932PAGESZ, CP932PAGES);
+	if (error != nil)
+		return error;
+
+	# page0 and dbcsoff are module globals: clear anything a previous
+	# init left behind before filling them in for this variant
+	for (i := 0; i < len page0; i++) {
+		page0[i] = BADCHAR;
+		dbcsoff[i] = -1;
+	}
+
+	# jisx0201kana is mapped into 16rA1..16rDF
+	for (i = 0; i < KANAPAGESZ; i++)
+		page0[i + KANACHAR0] = kana[i];
+
+	# 00..7f same as ascii in cp932
+	for (i = 0; i <= 16r7f; i++)
+		page0[i] = i;
+	if (shiftjis) {
+		# shift-jis uses JIS X 0201 for the ASCII range
+		# this is the same as ASCII apart from
+		# 16r5c ('\') maps to yen symbol (¥) and 16r7e ('~') maps to overline (¯)
+		page0['\\'] = '¥';
+		page0['~'] = '¯';
+	}
+
+	# pre-calculate DBCS page numbers to mapping file page numbers
+	# and mark codes in page0 that are DBCS lead bytes;
+	# pages are numbered in the order the lead byte ranges appear in the map file
+	pnum := 0;
+	pnum = markleads(16r81, 16r84, pnum);
+	pnum = markleads(16r87, 16r9f, pnum);
+	pnum = markleads(16re0, 16rea, pnum);
+	if (!shiftjis) {
+		# add in cp932 extensions
+		pnum = markleads(16red, 16ree, pnum);
+		pnum = markleads(16rfa, 16rfc, pnum);
+	}
+	return nil;
+}
+
+# markleads flags bytes lo..hi (inclusive) as DBCS lead bytes in page0 and
+# assigns them consecutive mapping-file pages starting at pnum.
+# Returns the next unused page number.
+markleads(lo, hi, pnum : int) : int
+{
+	for (b := lo; b <= hi; b++) {
+		page0[b] = -1;
+		dbcsoff[b] = pnum++;
+	}
+	return pnum;
+}
+
+# btos converts bytes of b to a Unicode string, producing at most n
+# characters (n == -1 means no limit).
+#
+# Returns (nil, converted string, number of bytes of b consumed).
+# A lead byte that is the last byte of b is left unconsumed.
+# Double-byte codes with no usable mapping entry produce BADCHAR and
+# consume both bytes.
+btos(nil : Convcs->State, b : array of byte, n : int) : (Convcs->State, string, int)
+{
+	out := "";
+	pos := 0;	# bytes consumed so far; also the read cursor
+
+	if (n == -1)
+		n = MAXINT;
+
+	while (pos < len b && len out < n) {
+		lead := int b[pos];
+		c := page0[lead];
+		if (c == -1) {
+			# DBCS lead byte: needs a trail byte to decode
+			if (pos + 1 >= len b)
+				break;
+			page := dbcsoff[lead];
+			trail := (int b[pos + 1]) - CP932CHAR0;
+			c = BADCHAR;
+			if (page != -1 && trail >= 0 && trail < CP932PAGESZ)
+				c = cp932[(page * CP932PAGESZ) + trail];
+			pos += 2;
+		} else
+			pos++;
+		out[len out] = c;
+	}
+	return (nil, out, pos);
+}
+
+# getmap reads the character map file at path, which must hold exactly
+# pgsz * npgs characters.
+#
+# Returns (nil, map) on success, or (error string, nil) if the file cannot
+# be opened, a read fails, or the character count is wrong.
+getmap(path : string, pgsz, npgs : int) : (string, string)
+{
+	fd := sys->open(path, Sys->OREAD);
+	if (fd == nil)
+		return (sys->sprint("%s: %r", path), nil);
+
+	nchars := pgsz * npgs;
+	# allow the largest encoding (Sys->UTFmax bytes) for every character
+	buf := array[nchars * Sys->UTFmax] of byte;
+	nread := 0;
+	while (nread < len buf) {
+		n := sys->read(fd, buf[nread:], Sys->ATOMICIO);
+		if (n < 0) {
+			# report the real I/O error rather than letting a short
+			# read show up later as "bad data"
+			return (sys->sprint("%s: %r", path), nil);
+		}
+		if (n == 0)
+			break;
+		nread += n;
+	}
+	map := string buf[:nread];
+	if (len map != nchars)
+		return (sys->sprint("%s: bad data", path), nil);
+	return (nil, map);
+}