#!/usr/bin/perl sub load { local $/; local *FILE; open FILE, shift; return ; } @punct = qw( + - * / % & | ^ ? : ! ( ) [ ] < > = { } ~ ' " ; . , == && || ** -> ++ -- += -= *= /= => >= <= ); @keywords = qw( if else for while do return int double char ); $PUNCT = 1; $PART_PUNCT = 2; %punct = (); for (@punct) { my $punct = $_; $punct{$punct} |= $PUNCT; for (;;) { substr $punct, -1, 1, ''; last if $punct eq ''; $punct{$punct} |= $PART_PUNCT; } } %keywords = (); for (@keywords) { $keywords{$_} |= 1; } sub parse { my $text = shift; my @tokens = (); for ($text) { redo if (/\G([\s\n]+)/gc); # whitespace if (/\G([\w]+)/gc) { # word push @tokens, $1; } elsif (/\G([^\s\nA-Za-z0-9_]+)/gc) { # punctuation my $symbols = $1; my $symbol = ''; { $symbol .= substr $symbols, 0, 1, ''; my $p = $punct{$symbol}; redo if $p & $PART_PUNCT and $symbols ne ''; $symbols .= substr $symbol, -1, 1, '' unless ($p); if ($punct{$symbol}) { push @tokens, $symbol; $symbol = ''; redo if $symbols ne ''; } else { die 'bad text'; } } } elsif (/\G$/gc) { last; } else { die 'broken parser!'; } redo; } @tokens; } $| = 1; use Data::Dumper; use Benchmark; timethis(100, q{$tokens = [parse 'if (a <= 100) { ++a; }']}); print Dumper $tokens; #load 'play.pl';