From a40ef1434889babbd88c9d0c5913c70e96ac2774 Mon Sep 17 00:00:00 2001
From: "Konstantin Kirik (snegovick)"
Date: Fri, 12 Dec 2025 03:51:07 +0300
Subject: Add simple parser for future shell-like lang

---
 mkfile      |  36 ++++++++++
 sh92.b      | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 sh9parser.b | 154 ++++++++++++++++++++++++++++++++++++++
 sh9parser.m |  30 ++++++++
 sh9util.b   |  26 ++++++++
 sh9util.m   |   8 +++
 6 files changed, 486 insertions(+)
 create mode 100644 mkfile
 create mode 100644 sh92.b
 create mode 100644 sh9parser.b
 create mode 100644 sh9parser.m
 create mode 100644 sh9util.b
 create mode 100644 sh9util.m

diff --git a/mkfile b/mkfile
new file mode 100644
index 0000000..cf68f33
--- /dev/null
+++ b/mkfile
@@ -0,0 +1,36 @@
+<../../../mkconfig
+
+TARG=sh92.dis\
+	sh9util.dis\
+	sh9parser.dis\
+
+INS=	$ROOT/dis/sh92.dis\
+	$ROOT/dis/sh9/sh9util.dis\
+	$ROOT/dis/sh9/sh9parser.dis\
+
+SYSMODULES=\
+	sys.m\
+
+DISBIN=$ROOT/dis/sh9
+
+<$ROOT/mkfiles/mkdis
+
+all:V:		$TARG
+
+install:V:	$INS
+	cp $DISBIN/sh92.dis $DISBIN/..
+
+nuke:V: clean
+	rm -f $INS
+
+clean:V:
+	rm -f *.dis *.sbl
+
+uninstall:V:
+	rm -f $INS
+
+# -p: do not fail when $DISBIN already exists (e.g. on a repeated install).
+$ROOT/dis/sh92.dis:	sh92.dis
+	mkdir -p $DISBIN/ && rm -f $ROOT/dis/sh92.dis && cp sh92.dis $ROOT/dis/sh92.dis
+
+%.dis:		${SYSMODULES:%=$MODDIR/%}
diff --git a/sh92.b b/sh92.b
new file mode 100644
index 0000000..bce58c3
--- /dev/null
+++ b/sh92.b
@@ -0,0 +1,232 @@
+implement Sh92;
+
+include "sys.m";
+include "draw.m";
+include "sh9util.m";
+include "sh9parser.m";
+
+sys: Sys;
+sh9u: Sh9Util;
+sh9p: Sh9Parser;
+
+Sh92: module {
+    init: fn(nil: ref Draw->Context, nil: list of string);
+};
+
+ModProc: adt {
+    name: string;
+    start: int;
+};
+
+ModVar: adt {
+    name: string;
+    val: string;
+};
+
+ShModule: adt {
+    global_vars: list of ref ModVar;
+    procs: list of ref ModProc;
+};
+
+GrammarNode: import sh9p;
+TokNode: import sh9p;
+mk_tok: import sh9p;
+set_last_tok: import sh9p;
+print_toks: import sh9p;
+parse_toks: import sh9p;
+
+reverse_list: import sh9u;
+to_array: import sh9u;
+
+# Token type tags; the typ field of a token holds one of these strings.
+S_UNKNOWN: con "UNK";
+S_ID: con "ID";
+S_STR: con "STR";
+S_EQ: con "EQ";
+S_DOL: con "DOL";
+S_COLON: con "COLON";
+S_SEMIC: con "SEMIC";
+S_LPAR: con "LPAR";
+S_RPAR: con "RPAR";
+S_LCURLY: con "LCURLY";
+S_RCURLY: con "RCURLY";
+S_DQSTR: con "DQSTR";
+S_SQSTR: con "SQSTR";
+S_DQTE: con "DQTE";
+S_SQTE: con "SQTE";
+S_SP: con "SP";
+S_TAB: con "TAB";
+S_EOL: con "EOL";
+
+# Tags used as grammar rule patterns and transforms.
+S_STMT: con "STMT";
+S_EXPR: con "EXPR";
+S_CALL: con "CALL";
+
+# Split one source line into tokens, in source order.
+# line_n is recorded in every token; the result always ends with an S_EOL token.
+# Quoted strings keep their quotes in tok; a quote preceded by a single
+# backslash does not terminate the string.
+tokenize(line: string, line_n: int): array of ref TokNode {
+    toks : list of ref TokNode;
+    last_tok: TokNode;
+    last_tok.start = -1;
+    last_tok.line = -1;
+    last_tok.tok = "";
+    last_tok.typ = S_UNKNOWN;
+
+    for (i := 0; i < len line; i++) {
+        if (last_tok.typ == S_DQSTR) {
+            case (line[i:i+1]) {
+            "\"" => {
+                l := len last_tok.tok;
+                if ((last_tok.tok[l-1:] != "\\") || ((last_tok.tok[l-1:] == "\\") && (last_tok.tok[l-2:l-1] == "\\"))) {
+                    # end of str
+                    last_tok.tok = last_tok.tok + line[i:i+1];
+                    (last_tok, toks) = set_last_tok(ref last_tok, toks);
+                } else {
+                    # escaped dqte, just continue
+                    last_tok.tok = last_tok.tok + line[i:i+1];
+                }
+            };
+            * => {
+                last_tok.tok = last_tok.tok + line[i:i+1];
+            }
+            }
+        } else if (last_tok.typ == S_SQSTR) {
+            case (line[i:i+1]) {
+            "'" => {
+                l := len last_tok.tok;
+                if ((last_tok.tok[l-1:] != "\\") || ((last_tok.tok[l-1:] == "\\") && (last_tok.tok[l-2:l-1] == "\\"))) {
+                    # end of str
+                    last_tok.tok = last_tok.tok + line[i:i+1];
+                    (last_tok, toks) = set_last_tok(ref last_tok, toks);
+                } else {
+                    # escaped sqte, just continue
+                    last_tok.tok = last_tok.tok + line[i:i+1];
+                }
+            };
+            * => {
+                last_tok.tok = last_tok.tok + line[i:i+1];
+            }
+            }
+        } else {
+            case (line[i:i+1]) {
+            " " or "\t" => {
+                (last_tok, toks) = set_last_tok(ref last_tok, toks);
+            };
+            "=" => {
+                (last_tok, toks) = set_last_tok(ref last_tok, toks);
+                toks = ref mk_tok(i, line_n, "=", S_EQ) :: toks;
+            };
+            ";" => {
+                (last_tok, toks) = set_last_tok(ref last_tok, toks);
+                toks = ref mk_tok(i, line_n, ";", S_SEMIC) :: toks;
+            };
+            "$" => {
+                (last_tok, toks) = set_last_tok(ref last_tok, toks);
+                toks = ref mk_tok(i, line_n, "$", S_DOL) :: toks;
+            };
+            "(" => {
+                (last_tok, toks) = set_last_tok(ref last_tok, toks);
+                toks = ref mk_tok(i, line_n, "(", S_LPAR) :: toks;
+            };
+            ")" => {
+                (last_tok, toks) = set_last_tok(ref last_tok, toks);
+                toks = ref mk_tok(i, line_n, ")", S_RPAR) :: toks;
+            };
+            "{" => {
+                (last_tok, toks) = set_last_tok(ref last_tok, toks);
+                toks = ref mk_tok(i, line_n, "{", S_LCURLY) :: toks;
+            };
+            "}" => {
+                (last_tok, toks) = set_last_tok(ref last_tok, toks);
+                toks = ref mk_tok(i, line_n, "}", S_RCURLY) :: toks;
+            };
+            "\"" => {
+                (last_tok, toks) = set_last_tok(ref last_tok, toks);
+                last_tok.start = i;
+                last_tok.line = line_n;
+                last_tok.typ = S_DQSTR;
+                last_tok.tok = last_tok.tok + line[i:i+1];
+            };
+            "'" => {
+                (last_tok, toks) = set_last_tok(ref last_tok, toks);
+                last_tok.start = i;
+                last_tok.line = line_n;
+                last_tok.typ = S_SQSTR;
+                last_tok.tok = last_tok.tok + line[i:i+1];
+            };
+            * => {
+                if (last_tok.start == -1) {
+                    last_tok.start = i;
+                    last_tok.line = line_n;
+                    last_tok.typ = S_ID;
+                }
+                last_tok.tok = last_tok.tok + line[i:i+1];
+            };
+            }
+        }
+    }
+    (last_tok, toks) = set_last_tok(ref last_tok, toks);
+    toks = ref mk_tok(i, line_n, "", S_EOL) :: toks;
+    toks = reverse_list(toks);
+    return to_array(toks);
+}
+
+# Grammar callback attached to the assignment rules; only logs for now.
+stmt_assign(toks: array of ref TokNode) {
+    sys->print("ASSIGN STMT\n");
+}
+
+# Grammar callback attached to the command-call rule; only logs for now.
+stmt_cmd_call(toks: array of ref TokNode) {
+    sys->print("CMD CALL\n");
+}
+
+# Grammar callback for rules that need no action beyond logging.
+empty(toks: array of ref TokNode) {
+    sys->print("EMPTY\n");
+}
+
+# Report a module that failed to load on standard error and abort.
+load_failed(path: string) {
+    sys->fprint(sys->fildes(2), "sh92: cannot load %s: %r\n", path);
+    raise "fail:load";
+}
+
+# Entry point: load the helper modules, build the grammar, run the demo line.
+init(ctxt: ref Draw->Context, argv: list of string) {
+    sys = load Sys Sys->PATH;
+    sh9u = load Sh9Util Sh9Util->PATH;
+    if (sh9u == nil)
+        load_failed(Sh9Util->PATH);
+    sh9p = load Sh9Parser Sh9Parser->PATH;
+    if (sh9p == nil)
+        load_failed(Sh9Parser->PATH);
+    # The parser keeps its own Sys and Sh9Util handles; they are nil until init.
+    sh9p->init();
+
+    assign_g_semic : GrammarNode = (array [] of {S_ID, S_EQ, S_EXPR, S_SEMIC}, S_UNKNOWN, stmt_assign);
+    assign_g_eol : GrammarNode = (array [] of {S_ID, S_EQ, S_EXPR, S_EOL}, S_UNKNOWN, stmt_assign);
+    sqstr_expr_g: GrammarNode = (array [] of {S_SQSTR}, S_EXPR, empty);
+    # NOTE(review): tokenize never emits S_STR (double-quoted text is S_DQSTR) - confirm the intended type.
+    str_expr_g: GrammarNode = (array [] of {S_STR}, S_EXPR, empty);
+    # NOTE(review): same pattern as assign_g_semic, which is listed first, so this rule cannot fire.
+    cmd_call_g: GrammarNode = (array [] of {S_ID, S_EQ, S_EXPR, S_SEMIC}, S_UNKNOWN, stmt_cmd_call);
+    grammar: array of ref GrammarNode;
+    grammar = array [] of {ref assign_g_semic, ref assign_g_eol, ref sqstr_expr_g, ref str_expr_g, ref cmd_call_g};
+
+    toks1 := tokenize("A = 'smth \"test\" ';", 0);
+    print_toks(toks1);
+    sys->print("Parse\n");
+    parse_toks(toks1, grammar);
+    sys->print("Parse done\n");
+
+    # toks2 := tokenize("echo \"smth \" \"test\";", 0);
+    # print_toks(toks2);
+    # toks3 := tokenize("if test x\"a\" = x\"b\"; then echo \"1\"; fi", 0);
+    # print_toks(toks3);
+    # toks4 := tokenize("echo 'smth2' 'test';", 0);
+    # print_toks(toks4);
+}
diff --git a/sh9parser.b b/sh9parser.b
new file mode 100644
index 0000000..fcae119
--- /dev/null
+++ b/sh9parser.b
@@ -0,0 +1,154 @@
+implement Sh9Parser;
+
+include "sys.m";
+include "sh9parser.m";
+include "sh9util.m";
+
+sys: Sys;
+S_UNKNOWN: con "UNK";
+
+sh9u: Sh9Util;
+
+reverse_list: import sh9u;
+to_array: import sh9u;
+
+# Load the modules this file depends on. Must be called once before any
+# other function here: sys and sh9u are nil until then.
+init() {
+    sys = load Sys Sys->PATH;
+    sh9u = load Sh9Util Sh9Util->PATH;
+}
+
+# Print the rule pattern, then "-> transform" unless transform is S_UNKNOWN.
+GrammarNode.print_expr(gn: self ref GrammarNode) {
+    lg:= len gn.expr;
+    for (i:=0; i<lg; i++) {
+        sys->print("%s ", gn.expr[i]);
+    }
+    if (gn.transform == S_UNKNOWN) {
+        sys->print("\n");
+    } else {
+        sys->print("-> %s\n", gn.transform);
+    }
+}
+
+# Build a token value from its four fields.
+mk_tok(start: int, line: int, tok: string, typ: string) : TokNode {
+    tok_node: TokNode;
+    tok_node.start = start;
+    tok_node.line = line;
+    tok_node.tok = tok;
+    tok_node.typ = typ;
+    return tok_node;
+}
+
+# Flush the token being accumulated: if last_tok has a type, push it onto toks
+# and return a blank (S_UNKNOWN) token; otherwise return both unchanged.
+set_last_tok(last_tok: ref TokNode, toks: list of ref TokNode): (TokNode, list of ref TokNode) {
+    ret_tok: TokNode;
+    ret_tok = *last_tok;
+    if (last_tok.typ != S_UNKNOWN) {
+        toks = last_tok :: toks;
+        ret_tok.typ = S_UNKNOWN;
+        ret_tok.start = -1;
+        ret_tok.tok = "";
+        ret_tok.line = -1;
+    }
+    return (ret_tok, toks);
+}
+
+# Print one token per line: index, count, type and text.
+print_toks(toks: array of ref TokNode) {
+    lt := len toks;
+    for (i := 0; i < lt; i ++) {
+        tok := toks[i];
+        sys->print("[%d/%d] %s (%s)\n", i, lt, tok.typ, tok.tok);
+    }
+}
+
+# Print only the token types, space separated, on a single line.
+print_toks_short(toks: array of ref TokNode) {
+    lt := len toks;
+    for (i := 0; i < lt; i ++) {
+        tok := toks[i];
+        sys->print("%s ", tok.typ);
+    }
+    sys->print("\n");
+}
+
+# Return 1 when the first len gn.expr tokens of toks have the types listed in
+# gn.expr, 0 otherwise. An empty pattern never matches.
+check_grammar_node_match(toks: array of ref TokNode, gn: ref GrammarNode): int {
+    lt:= len toks;
+    lg:= len gn.expr;
+    # lg == 0 would match everywhere and keep parse_toks rewriting forever.
+    if (lg == 0 || lg > lt) {
+        return 0;
+    }
+    #sys->print("Checking grammar ");
+    gn.print_expr();
+    #sys->print("Against ");
+    print_toks(toks);
+    for (i:= 0; i < lg; i ++) {
+        if (toks[i].typ != gn.expr[i]) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+# Return a copy of src in which the replace_len tokens starting at
+# replace_start are replaced by the tokens of replace_with.
+# NOTE(review): the loop bodies below were reconstructed from the signature and
+# the call in parse_toks - verify against the committed file.
+replace_toks(src: array of ref TokNode, replace_start: int, replace_len: int, replace_with: array of ref TokNode): array of ref TokNode {
+    src_len:= len src;
+    new_toks: list of ref TokNode;
+    with_len:= len replace_with;
+    for (i:=0; i<replace_start; i++) {
+        new_toks = src[i] :: new_toks;
+    }
+    for (i=0; i<with_len; i++) {
+        new_toks = replace_with[i] :: new_toks;
+    }
+    for (i=replace_start+replace_len; i<src_len; i++) {
+        new_toks = src[i] :: new_toks;
+    }
+    new_toks = reverse_list(new_toks);
+    return to_array(new_toks);
+}
+
+# Repeatedly rewrite toks with the rules in g until no rule matches.
+# Each pass tries ever longer suffixes of toks against every rule in order;
+# the first match is passed to the rule's callback and replaced by a single
+# token whose type is the rule's transform, then scanning restarts.
+parse_toks(toks: array of ref TokNode, g: array of ref GrammarNode): array of ref TokNode {
+    lgns := len g;
+    changed := 0;
+    ctr := 0;
+
+    do {
+        lt := len toks;
+        sys->print("Loop %d: ", ctr);
+        print_toks_short(toks);
+        ctr ++;
+        changed = 0;
+        fast: for (i := 0; i <= lt; i ++) {
+            for (j := 0; j < lgns; j++) {
+                gj:= g[j];
+                if (check_grammar_node_match(toks[lt - i:], gj) == 1) {
+                    sys->print("Something matched !\n");
+                    gj.print_expr();
+                    sys->print("Before replace: ");
+                    print_toks_short(toks);
+                    gj.callback(toks[lt-i: lt-i+len gj.expr]);
+                    toks = replace_toks(toks, lt-i, len gj.expr, array[] of {ref mk_tok(toks[lt - i].start, toks[lt - i].line, "", gj.transform)});
+                    sys->print("After replace: ");
+                    changed = 1;
+                    break fast;
+                }
+            }
+        }
+    } while(changed);
+    return toks;
+}
diff --git a/sh9parser.m b/sh9parser.m
new file mode 100644
index 0000000..cbad2f6
--- /dev/null
+++ b/sh9parser.m
@@ -0,0 +1,30 @@
+Sh9Parser: module
+{
+PATH: con "sh9parser.dis";
+DESCR: con "Mostly generic parser for sh9";
+
+# Loads the modules the parser uses; call once after loading Sh9Parser.
+init: fn();
+mk_tok: fn(start: int, line: int, tok: string, typ: string) : TokNode;
+set_last_tok: fn(last_tok: ref TokNode, toks: list of ref TokNode): (TokNode, list of ref TokNode);
+print_toks: fn(toks: array of ref TokNode);
+print_toks_short: fn(toks: array of ref TokNode);
+check_grammar_node_match: fn(toks: array of ref TokNode, gn: ref GrammarNode): int;
+replace_toks: fn(src: array of ref TokNode, replace_start: int, replace_len: int, replace_with: array of ref TokNode): array of ref TokNode;
+parse_toks: fn(toks: array of ref TokNode, g: array of ref GrammarNode): array of ref TokNode;
+
+TokNode: adt {
+    start: int;
+    line: int;
+    tok: string;
+    typ: string;
+};
+
+GrammarNode: adt {
+    expr: array of string;
+    transform: string;
+
+    callback: ref fn(toks: array of ref TokNode);
+    print_expr: fn(gn: self ref GrammarNode);
+};
+};
diff --git a/sh9util.b b/sh9util.b
new file mode 100644
index 0000000..f74032e
--- /dev/null
+++ b/sh9util.b
@@ -0,0 +1,26 @@
+implement Sh9Util;
+
+include "sh9util.m";
+
+# Return a new list holding the elements of toks in reverse order.
+reverse_list[T](toks: list of T): list of T
+{
+    out : list of T;
+    # Advance with tl on every step; without it every slot would repeat hd toks.
+    for (; toks != nil; toks = tl toks) {
+        out = hd toks :: out;
+    }
+    return out;
+}
+
+# Copy the elements of toks, in order, into a freshly allocated array.
+to_array[T](toks: list of T): array of T {
+    lt := len toks;
+    out := array[lt] of T;
+    for (i := 0; i < lt; i ++) {
+        tok := hd toks;
+        toks = tl toks;
+        out[i] = tok;
+    }
+    return out;
+}
diff --git a/sh9util.m b/sh9util.m
new file mode 100644
index 0000000..aa44c3f
--- /dev/null
+++ b/sh9util.m
@@ -0,0 +1,8 @@
+Sh9Util: module
+{
+PATH: con "sh9util.dis";
+DESCR: con "Utility functions for sh9";
+
+reverse_list: fn[T](toks: list of T): list of T;
+to_array: fn[T](toks: list of T): array of T;
+};
-- 
cgit v1.2.3