summaryrefslogtreecommitdiff
path: root/appl/lib/convcs/euc-jp_btos.b
diff options
context:
space:
mode:
Diffstat (limited to 'appl/lib/convcs/euc-jp_btos.b')
-rw-r--r--appl/lib/convcs/euc-jp_btos.b162
1 files changed, 162 insertions, 0 deletions
diff --git a/appl/lib/convcs/euc-jp_btos.b b/appl/lib/convcs/euc-jp_btos.b
new file mode 100644
index 00000000..5a5aa9ec
--- /dev/null
+++ b/appl/lib/convcs/euc-jp_btos.b
@@ -0,0 +1,162 @@
+implement Btos;
+
+# EUC-JP is based on ISO2022 but only uses the 8 bit stateless encoding.
+# Thus, only the following ISO2022 shift functions are used:
+# SINGLE-SHIFT TWO
+# SINGLE-SHIFT THREE
+#
+# The initial state is G0 mapped into GL and G1 mapped into GR
+# SINGLE-SHIFT TWO maps G2 into GR for one code-point encoding
+# SINGLE-SHIFT THREE maps G3 into GR for one code-point encoding
+#
+# EUC-JP has pre-assigned code elements (G0..G3) that are never re-assigned
+# by means of ISO2022 code-identification functions (escape sequences)
+#
+# G0 = ASCII
+# G1 = JIS X 0208
+# G2 = JIS X 0201 Kana
+# G3 = JIS X 0212
+
+include "sys.m";
+include "convcs.m";
+
+# Handle to the Sys module; loaded by init().
+sys : Sys;
+
+# Lead bytes that select G2 / G3 for the code that follows.
+SS2 : con 16r8E; # ISO2022 SINGLE-SHIFT TWO
+SS3 : con 16r8F; # ISO2022 SINGLE-SHIFT THREE
+
+# MAXINT stands in for "no limit" when btos() is called with n == -1.
+MAXINT : con 16r7fffffff;
+# BADCHAR (U+FFFD) is emitted by btos() for input it cannot decode.
+BADCHAR : con 16rFFFD;
+
+# Files holding the mapping tables for G1, G2 and G3; read by init().
+G1PATH : con "/lib/convcs/jisx0208-1997";
+G2PATH : con "/lib/convcs/jisx0201kana";
+G3PATH : con "/lib/convcs/jisx0212";
+
+# Mapping tables, one character per code point, filled in by init().
+# btos() indexes them with (page * PAGESZ) + char.
+g1map : string;
+g2map : string;
+g3map : string;
+
+# G1 table geometry: G1NPAGES pages of G1PAGESZ characters each.
+# G1PAGE0 and G1CHAR0 are the byte values that map to page 0 and char 0.
+G1PAGESZ : con 94;
+G1NPAGES : con 84;
+G1PAGE0 : con 16rA1;
+G1CHAR0 : con 16rA1;
+
+# G2 table geometry: a single page of G2PAGESZ characters.
+# G2CHAR0 is the byte value that maps to char 0.
+G2PAGESZ : con 63;
+G2NPAGES : con 1;
+G2CHAR0 : con 16rA1;
+
+# G3 table geometry: G3NPAGES pages of G3PAGESZ characters each.
+# G3PAGE0 and G3CHAR0 are the byte values that map to page 0 and char 0.
+G3PAGESZ : con 94;
+G3NPAGES : con 77;
+G3PAGE0 : con 16rA1;
+G3CHAR0 : con 16rA1;
+
+# Load the Sys module and the three character mapping tables.
+# The argument is ignored.  Returns nil on success, otherwise the
+# error string from the first table that could not be loaded; later
+# tables are not attempted once one has failed.
+init(nil : string) : string
+{
+	sys = load Sys Sys->PATH;
+
+	err : string;
+	(err, g1map) = getmap(G1PATH, G1PAGESZ, G1NPAGES);
+	if (err == nil)
+		(err, g2map) = getmap(G2PATH, G2PAGESZ, G2NPAGES);
+	if (err == nil)
+		(err, g3map) = getmap(G3PATH, G3PAGESZ, G3NPAGES);
+	return err;
+}
+
+# Read a character mapping table from path.
+#
+# The file is decoded as UTF-8 text and must contain exactly
+# pgsz * npgs characters.
+#
+# Returns (nil, map) on success or (error, nil) on failure.  The error
+# is "path: <system error>" when the file cannot be opened or read, and
+# "path: bad data" when it does not hold the expected number of
+# characters.
+getmap(path : string, pgsz, npgs : int) : (string, string)
+{
+	fd := sys->open(path, Sys->OREAD);
+	if (fd == nil)
+		return (sys->sprint("%s: %r", path), nil);
+
+	nchars := pgsz * npgs;
+	# worst case: every character takes UTFmax bytes
+	buf := array[nchars * Sys->UTFmax] of byte;
+	nread := 0;
+	while (nread < len buf) {
+		n := sys->read(fd, buf[nread:], Sys->ATOMICIO);
+		if (n < 0) {
+			# a failed read is an I/O error, not a malformed table
+			return (sys->sprint("%s: %r", path), nil);
+		}
+		if (n == 0)
+			break;	# end of file
+		nread += n;
+	}
+	map := string buf[:nread];
+	if (len map != nchars)
+		return (sys->sprint("%s: bad data", path), nil);
+	return (nil, map);
+}
+
+# Convert EUC-JP bytes in b to a string of at most n characters
+# (n == -1 means no limit).
+#
+# The incoming state is ignored and nil is always returned as the new
+# state: the encoding is stateless between code points.  An incomplete
+# multi-byte code at the end of b is left unconsumed.
+#
+# Returns (nil, converted string, number of bytes of b consumed).
+# Input that cannot be decoded produces BADCHAR.
+btos(nil : Convcs->State, b : array of byte, n : int) : (Convcs->State, string, int)
+{
+	nbytes := 0;
+	str := "";
+
+	if (n == -1)
+		n = MAXINT;
+
+	codelen := 1;	# number of bytes needed for the current code
+	codeix := 0;	# number of those bytes gathered so far
+	G0, G1, G2, G3 : con iota;
+	state := G0;
+	bytes := array [3] of int;
+
+	while (len str < n) {
+		# gather bytes until the current code is complete or b runs out
+		for (i := nbytes + codeix; i < len b && codeix < codelen; i++)
+			bytes[codeix++]= int b[i];
+
+		if (codeix != codelen)
+			break;
+
+		case state {
+		G0 =>
+			# the first byte decides which code element is in use
+			case bytes[0] {
+			0 to 16r7f =>
+				str[len str] = bytes[0];
+			G1PAGE0 to G1PAGE0+G1NPAGES-1 =>
+				# case ranges are inclusive: stop at the last page
+				# actually present in g1map
+				state = G1;
+				codelen = 2;
+				continue;
+			SS2 =>
+				state = G2;
+				codelen = 2;
+				continue;
+			SS3 =>
+				state = G3;
+				codelen = 3;
+				continue;
+			* =>
+				str[len str] = BADCHAR;
+			}
+		G1 =>
+			# double byte encoding
+			page := bytes[0] - G1PAGE0;
+			char := bytes[1] - G1CHAR0;
+			# page was validated in G0; the trail byte was not
+			if (char < 0 || char >= G1PAGESZ)
+				str[len str] = BADCHAR;
+			else
+				str[len str] = g1map[(page * G1PAGESZ) + char];
+		G2 =>
+			# single byte encoding (byte 0 == SS2)
+			char := bytes[1] - G2CHAR0;
+			if (char < 0 || char >= len g2map)
+				char = BADCHAR;
+			else
+				char = g2map[char];
+			str[len str] = char;
+		G3 =>
+			# double byte encoding (byte 0 == SS3)
+			page := bytes[1] - G3PAGE0;
+			char := bytes[2] - G3CHAR0;
+			if (page < 0 || page >= G3NPAGES) {
+				# first byte is wrong - give the last byte back so
+				# it is decoded again as the start of a new code
+				i--;
+				str[len str] = BADCHAR;
+			} else if (char < 0 || char >= G3PAGESZ)
+				str[len str] = BADCHAR;
+			else
+				str[len str] = g3map[(page * G3PAGESZ)+char];
+		}
+
+		# one code point done: commit the consumed bytes and start over
+		state = G0;
+		nbytes = i;
+		codelen = 1;
+		codeix = 0;
+	}
+	return (nil, str, nbytes);
+} \ No newline at end of file