use util error
export types

# Tokenizer: classifies a token by its first character and scans to its end.
# Call tok_init() once before using token(), token_type() or char_name2(),
# because those read lookup tables that tok_init() fills in.

# Commented-out enum with fewer token types (token_t below is the one in use):
#typedef enum { COMMENT, TABS, SPACE, NUMBER, STRING, CHARACTER, NAME, BRACKET, DELIMIT, OP, ILLEGAL } token_types

# Token types returned by token().
# Tried to order these with the commonest ones first; usage statistics
# should really decide the order.
# What would a QUOTED type be good for?  Symbols, as in Lisp?  Code as data?
# It is disabled for now because it is hard-ish to implement.
typedef enum { NAME, SPACE, BRACKET, OP, NEWLINE, TABS, DELIMIT, NUMBER, COMMENT, STRING, CHARACTER, BSPACE, TOKEN_TYPE_TOP, ILLEGAL, TOK_EOT } token_t

# Set to 1 by tok_init() once the lookup tables have been filled.
boolean tok_initd = 0

# One entry per possible unsigned char value.
typedef char char_table[256]

# ct_token_type[c] caches token_type_(c); ct_name2[c] caches char_name2_(c).
char_table ct_token_type, ct_name2
# TODO more tables for different types of characters?

# Fill both lookup tables.  Only the first call does any work.
tok_init()
	if !tok_initd
		tok_initd = 1
		char_table_init(ct_token_type, c, token_type_(c))
		char_table_init(ct_name2, c, char_name2_(c))

# Uncached test: can c appear inside a name (letter, digit, '_' or '$')?
def char_name2_(c) tween(c, 'A', 'Z') || tween(c, 'a', 'z') || tween(c, '0', '9') || among(c, '_', '$')

# Table lookups; the index goes through uchar so it is never negative.
def token_type(c) ct_token_type[(int)(uchar)c]
def char_name2(c) ct_name2[(int)(uchar)(c)]

# Uncached classification: the token type implied by a token's first character.
# Returns a token_t value stored in a char so that it fits in a char_table.
char token_type_(char c)
	if c == '\0'
		return TOK_EOT
	eif c == '#'
		return COMMENT
	eif c == '\n'
		return NEWLINE
	eif c == '\t'
		return TABS
	eif c == ' '
		return SPACE
	eif c == '\b'
		return BSPACE
	eif tween(c, '0', '9')
		return NUMBER
		# I want to be able to identify a token's type from its first char,
		# so .1 can't be allowed; you must type 0.1 :/
	eif c == '"'
		return STRING
	eif c == '\''
		return CHARACTER
		# QUOTED is disabled (see token_t):
		# eif c == '`'
		#	return QUOTED
	eif tween(c, 'A', 'Z') || tween(c, 'a', 'z') || among(c, '_', '$')
		return NAME
	eif among(c, '[', ']', '(', ')', '{', '}')
		return BRACKET
	eif among(c, ';', ',')
		return DELIMIT
	eif tween(c, 33, 126)
		# any remaining character in the range 33..126
		return OP
	else
		return ILLEGAL

# Fill a char_table: for each var in 0..255, store the value of predicate
# (an expression written in terms of var) in array[var].
# The body is nested under `.` so that `int var` is block-scoped; tok_init()
# expands this macro twice with the same variable name.
# NOTE(review): nesting reconstructed from a whitespace-collapsed source - confirm.
def char_table_init(array, var, predicate)
	.
		int var
		for var=0 ; var<256 ; ++var
			char *p = array + var
			*p = predicate

# End-of-text character for the scanners below.
# This can be overridden to be \n instead.
def tok_EOT '\0'

# The tok_* macros below advance the char pointer i to the end of the current
# token.  token() calls them with i already past the token's first character.

# A comment runs to the end of the text.
def tok_comment(i)
	while *i != tok_EOT
		++i

# Runs of tabs, spaces and backspaces each form a single token.
def tok_tabs(i)
	while *i == '\t'
		++i

def tok_space(i)
	while *i == ' '
		++i

def tok_bspace(i)
	while *i == '\b'
		++i

# Number, chosen by the first digit (at i[-1]) and the character after it:
#   first digit not '0'        -> decimal
#   "0x" or "0X"               -> hexadecimal (the upper-case prefix used to be
#                                 missed, which split 0X1F into 0 and X1F)
#   '0' then an octal digit    -> octal
#   '0' then a non-digit       -> decimal (covers 0, 0.5, 0e3)
#   otherwise ('0' then 8 or 9) - a BAD number; at this tokenizer stage it
#   is split, 08 -> 0 8 for example :p
def tok_number(i)
	if *(i-1) != '0'
		tok_decimal(i)
	eif among(*i, 'x', 'X')
		++i
		tok_hex(i)
	eif tween(*i, '0', '7')
		tok_octal(i)
	eif !tween(*i, '0', '9')
		tok_decimal(i)

# Decimal: digits, optional ".digits", optional exponent, optional suffix.
def tok_decimal(i)
	tok_decimal_int(i)
	if *i == '.'
		++i
		tok_decimal_int(i)
	tok_float_exp(i)
	tok_float_suffix(i)

def tok_decimal_int(i)
	while tween(*i, '0', '9')
		++i

# Exponent: 'e' or 'E', an optional sign, then digits.
def tok_float_exp(i)
	if among(*i, 'e', 'E')
		++i
		if among(*i, '+', '-')
			++i
		tok_decimal_int(i)

# At most one suffix letter out of f F l L.
def tok_float_suffix(i)
	if among(*i, 'f', 'F', 'l', 'L')
		++i

def tok_hex(i)
	while tween(*i, '0', '9') || tween(*i, 'A', 'F') || tween(*i, 'a', 'f')
		++i

def tok_octal(i)
	while tween(*i, '0', '7')
		++i

# String body: scan to the closing '"' or the end of text.  A backslash
# skips the character after it.  The closing quote is consumed if present.
def tok_string(i)
	while !among(*i, '"', tok_EOT)
		if *i == '\\'
			++i
			if *i == tok_EOT
				break
		++i
	if *i == '"'
		++i

# Character literal body: same rules as tok_string, delimited by '\''.
def tok_char(i)
	while !among(*i, '\'', tok_EOT)
		if *i == '\\'
			++i
			if *i == tok_EOT
				break
		++i
	if *i == '\''
		++i

def tok_name(i)
	while char_name2(*i)
		++i

# Brackets and delimiters are always one character long: nothing more to scan.
def tok_bracket(i) .
def tok_delimit(i) .

# A run of operator characters forms a single token.
def tok_op(i)
	while token_type(*i) == OP
		++i

def tok_illegal(i)
	while token_type(*i) == ILLEGAL && *i != tok_EOT
		++i

# 1 for an opening bracket, -1 for anything else.
def bracket_type(c) among(c, '[', '(', '{') ? 1 : -1

# Scan one token starting at *i_ptr.
# Returns the token's type and moves *i_ptr to just past the token.
# The first character is always consumed, including the '\0' of a TOK_EOT.
# Calls fault() if the table yields a type with no scanner (TOKEN_TYPE_TOP).
token_t token(char **i_ptr)
	char *i = *i_ptr
	token_t t = token_type(*i++)
	which t
	TOK_EOT	.
	COMMENT	tok_comment(i)
	NEWLINE	.
	TABS	tok_tabs(i)
	SPACE	tok_space(i)
	BSPACE	tok_bspace(i)
	NUMBER	tok_number(i)
	STRING	tok_string(i)
	CHARACTER	tok_char(i)
	NAME	tok_name(i)
	BRACKET	tok_bracket(i)
	DELIMIT	tok_delimit(i)
	OP	tok_op(i)
	ILLEGAL	tok_illegal(i)
	else	fault("unknown token type here: %s", *i_ptr)
	*i_ptr = i
	return t

# Does the token spanning [p0, p1) equal the NUL-terminated string s?
# Returns 1 on an exact match, 0 otherwise.
boolean tok_eq(char *p0, char *p1, cstr s)
	while p0 != p1
		# Stop at the end of s before advancing: a token containing a NUL byte
		# would otherwise match the terminator and walk s past its end.
		if *s == '\0' || *p0 != *s
			return 0
		++p0
		++s
	return *s == '\0'