[m-dev.] For review: general purpose lexer module

Ralph Becket rbeck at microsoft.com
Sat Feb 10 02:41:05 AEDT 2001


This is a mostly efficient lexer module I wrote that will handle
arbitrary regular expressions and is easy to use.

It's not clear to me that I've got the interface exactly right and
I have a funny feeling I've misused the term `lexeme'.  Any feedback
would be gratefully received.

I propose that, if accepted, this should go in the standard library
since it is of general utility.

And now, the code...

<<README>>
lex 1.0 (very alpha)
Fri Aug 25 17:54:28  2000
(C) Ralph Becket <rbeck at microsoft.com>



This package defines a lexer for Mercury.  There is plenty of scope
for optimization; however, it is reasonably efficient and does provide
the holy grail of piecemeal lexing of stdin (and strings, and lists,
and ...)

The interface is simple.

1. Import modules lex, lex__lexeme, lex__buf, lex__regexp.

    :- import_module lex, lex__lexeme, lex__buf, lex__regexp.

2. Set up a token type.

    :- type token
        --->    comment
        ;       id
        ;       num.

3. Set up a list of lexemes.

    Lexemes = [
        lexeme(noval(comment),      (atom('%') >> star(dot))),
        lexeme(value(id),           identifier),
        lexeme(ignore,              whitespace)
    ]

noval tokens are simply identified;
value tokens are identified and returned with the string matched;
ignore regexps are simply passed over.

4. Set up a lexer with an appropriate read predicate (see the buf module).

    Lexer = lex__init(Lexemes, lex__buf__read_from_stdin)

5. Obtain a live lexer state.

    State0 = lex__start(Lexer, IO0)

6. Use it to lex the input stream.

    lex__read(Result, State0, State1),
    ( Result = ok(NoValToken), ...
    ; Result = ok(ValueToken, String), ...
    ; Result = error(OffsetInInputStream), ...
    ; Result = eof, ...
    )

7. If you need to manipulate the source object, you can.

    lex__manipulate_source(io__print("not finished yet?"), State1, State2)

8. When you're done, retrieve the source object.

    IO = lex__stop(State)

And that's basically it.

In future I plan to add several optimizations to the implementation
and the option to write out a compilable source file for the lexer.



OPPORTUNITIES FOR OPTIMIZATION

1. Move from chars to bytes.
2. Implement a byte_array rather than using a wasteful array(char) for the
input buffer.
3. Implement the first-byte optimization whereby the set of `live lexemes'
is decided by the first byte read in on a lexing pass.
4. Implement state machine minimization (may or may not be worthwhile.)
<<END OF README>>

%---------------------------------------------------------------------------
-- %
% test_lex.m
% Copyright (C) 2001 Ralph Becket <rbeck at microsoft.com>
% Sun Aug 20 18:11:42 BST 2000
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix ft=mercury
% 
% Test harness.
%---------------------------------------------------------------------------
-- %

:- module test_lex.

:- interface.

:- import_module io.

:- pred main(io__state::di, io__state::uo) is det.

%---------------------------------------------------------------------------
-- %
%---------------------------------------------------------------------------
-- %

:- implementation.

:- import_module char, string, exception, array, list.
:- import_module lex, lex__regexp, lex__buf, lex__lexeme.

%---------------------------------------------------------------------------
-- %

    % Entry point: builds a lexer over stdin and echoes each lexer
    % result until eof.
    %
main -->
    % Alternative harness: lex a fixed test string instead of stdin.
    % { TestString = "the cat sat * on the mat" },
    % print("\nTEST STRING: "), print(TestString), nl,
    % { Lexer = lex__init(lexemes, buf__read_from_string) },
    % { State = lex__start(Lexer, TestString) },
    % tokenise_string(State).

    print("\nYOU TYPE, I'LL LEX...\n\n"),
    { Lexer  = lex__init(lexemes, buf__read_from_stdin) },
        % A second, identical lexer is built and printed here, apparently
        % to inspect the compiled lexeme representation -- debug output.
    { LexerCopy  = lex__init(lexemes, buf__read_from_stdin) },
    print(LexerCopy), nl,
        % The lambda is needed so the io__state threaded by this DCG can
        % be handed to lex__start as the lexer's unique source object and
        % recovered afterwards via lex__stop.
    { Kludge =
        ( pred(IO0::di, IO::uo) is det :-
            State0 = lex__start(Lexer, IO0),
            tokenise_stdin(State0, State),
            IO     = lex__stop(State)
        )
    },
    Kludge.

%---------------------------------------------------------------------------
-- %

    % Repeatedly read results from the lexer state and print each one
    % (manipulate_source gives us access to the io__state buried inside
    % the lexer state), stopping once eof is returned.
    %
:- pred tokenise_stdin(lexer_state(token, io__state),
                lexer_state(token, io__state)).
:- mode tokenise_stdin(di, uo) is det.

tokenise_stdin -->
    lex__read(Result),
    lex__manipulate_source(print(Result)),
    lex__manipulate_source(nl),
    ( if { Result \= eof } then
        tokenise_stdin
      else
        []
    ).

%---------------------------------------------------------------------------
-- %

    % As tokenise_stdin, but the lexer's source is a string, so results
    % are printed via the separately threaded io__state.
    %
:- pred tokenise_string(lexer_state(token, string), io__state, io__state).
:- mode tokenise_string(di, di, uo) is det.

tokenise_string(State0) -->
    { lex__read(Result, State0, State) },
    print(Result),
    nl,
    ( if { Result \= eof } then
        tokenise_string(State)
      else
        []
    ).

%---------------------------------------------------------------------------
-- %

    % The token type for the toy grammar recognised by lexemes/0.
    %
:- type token
    --->    noun
    ;       comment
    ;       integer
    ;       real
    ;       verb
    ;       conj
    ;       prep
    ;       punc
    ;       ' '.        % NOTE(review): no lexeme produces this token --
                        % whitespace is skipped via `ignore' below.

:- func lexemes = list(lexeme(annotated_token(token))).

lexemes = [
    lexeme(value(comment),
			(str("%") >> junk)),

    lexeme(value(integer),
			(int)),

    lexeme(value(real),
			(opt(str("-")) >> int >> str(".") >> int >>
             opt((any("eE") >> opt(str("-")) >> int)))),

    lexeme(value(noun), str("cat")),
    lexeme(value(noun), str("dog")),
    lexeme(value(noun), str("rat")),
    lexeme(value(noun), str("mat")),

    lexeme(value(verb),
			(str("sat") \/ str("caught") \/ str("chased"))),

    lexeme(value(conj),
			(str("and") \/ str("then"))),

    lexeme(value(prep),
			(str("the") \/ str("it") \/ str("them") \/ str("to")
\/ str("on"))),

    lexeme(noval(punc),
			(any(".,:;-!?&'""`~_\\[]{}<>@#$%^*()"))),

    lexeme(ignore,
			whitespace)
].

%---------------------------------------------------------------------------
-- %
%---------------------------------------------------------------------------
-- %

%---------------------------------------------------------------------------
-- %
% lex.m
% Ralph Becket <rbeck at microsoft.com>
% Sun Aug 20 09:08:46 BST 2000
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix
%
% This module puts everything together, compiling a list of lexemes
% into state machines and turning the input stream into a token stream.
%---------------------------------------------------------------------------
-- %

:- module lex.

:- interface.

:- include_module lex__regexp, lex__lexeme, lex__buf.
:- include_module lex__automata, lex__fna_to_fda.

:- import_module std_util, string, list.
:- import_module lex__lexeme, lex__buf.



:- type annotated_token(Token)
    --->    noval(Token)                % Just return ok(Token) on success.
    ;       value(Token)                % Return ok(Token, String) on
success.
    ;       ignore.                     % Just skip over these tokens.



:- type lexer_result(Token)
    --->    ok(Token)                   % Noval token matched.
    ;       ok(Token, string)           % Value token matched.
    ;       eof                         % End of input.
    ;       error(int).                 % No matches for string at this
offset.



:- type lexer(Token, Source)
    --->    lexer(
                lex_compiled_lexemes    :: list(live_lexeme(Token)),
                lex_buf_read_pred       :: buf_read_pred(Source)
            ).
:- inst lexer
    ==      bound(lexer(ground, buf_read_pred)).
:- inst lexer_u
    ==      bound(lexer(ground, buf_read_pred_u)).



:- type lexer_instance(Token, Source)
    --->    lexer_instance(
                lexi_init_lexemes       :: list(live_lexeme(Token)),
                lexi_live_lexemes       :: list(live_lexeme(Token)),
                lexi_current_winner     :: winner(Token),
                lexi_buf_state          :: buf_state(Source)
            ).
:- inst lexer_instance
    ==      bound(lexer_instance(ground, ground, ground, buf_state)).
:- inst lexer_instance_u
    ==      bound(lexer_instance(ground, ground, ground, buf_state_u)).



:- type live_lexeme(Token)
    ==      compiled_lexeme(annotated_token(Token)).



:- type winner(Token)
    ==      maybe(pair(annotated_token(Token), offset)).



    % The opaque state threaded through read/3 etc.  It packages up the
    % lexer instance, the char buffer, and the input source (see the
    % implementation section for the C-level representation).
    %
:- type lexer_state(Token, Source).



    % Construct a lexer structure from which we can generate running
    % instances.
    %
:- func init(list(lexeme(annotated_token(Tok))), buf_read_pred(Src)) =
                lexer(Tok, Src).
:- mode init(in, in(buf_read_pred)) = out(lexer) is det.
:- mode init(in, in(buf_read_pred_u)) = out(lexer_u) is det.

    % Generate a running instance of a lexer on some input source.
    % Note that you can't get the input source back until you stop
    % lexing.
    %
:- func start(lexer(Tok, Src), Src) = lexer_state(Tok, Src).
:- mode start(in(lexer_u), di) = uo is det.

    % Read the next result (token, token-with-string, error or eof)
    % from the input.
    %
:- pred read(lexer_result(Tok), lexer_state(Tok, Src), lexer_state(Tok,
Src)).
:- mode read(out, di, uo) is det.

    % Stop a running instance of a lexer and retrieve the input source.
    %
    % XXX Er, the mode isn't quite right.  The result won't be unique
    % if the input source wasn't.  This is the price one pays for
    % not having nested unique modes and polymorphic modes yet.
    %
:- func stop(lexer_state(_Tok, Src)) = Src.
:- mode stop(di) = uo is det.

    % Sometimes (e.g. when lexing the io__state) you want access to the
    % input stream without interrupting the lexing process.  This pred.
    % gives that sort of access.
    %
:- pred manipulate_source(pred(Src, Src),
                lexer_state(_Tok, Src), lexer_state(_Tok, Src)).
:- mode manipulate_source(pred(di, uo) is det, di, uo) is det.

%---------------------------------------------------------------------------
-- %
%---------------------------------------------------------------------------
-- %

:- implementation.

:- include_module lex__automata, lex__fna_to_fda.

:- import_module map, char, bool, int, exception, array.
:- import_module lex__regexp, lex__automata, lex__fna_to_fda.

%---------------------------------------------------------------------------
-- %

    % Compile each lexeme's regexp into its state-machine form; the read
    % predicate is stored unchanged.
    %
init(Lexemes, BufReadPred) = lexer(CompiledLexemes, BufReadPred) :-
    CompiledLexemes = list__map(lexeme__compile_lexeme, Lexemes).

%---------------------------------------------------------------------------
-- %

start(Lexer, Src) = LexerState :-
    init_lexer_instance(Lexer, Instance, Buf),
    args_lexer_state(Instance, Buf, Src, LexerState).

%---------------------------------------------------------------------------
-- %

:- pred init_lexer_instance(lexer(Tok, Src), lexer_instance(Tok, Src), buf).
:- mode init_lexer_instance(in(lexer), out(lexer_instance), array_uo) is
det.
:- mode init_lexer_instance(in(lexer_u),out(lexer_instance_u),array_uo) is
det.

init_lexer_instance(Lexer, Instance, Buf) :-
    buf__init(Lexer ^ lex_buf_read_pred, BufState, Buf),
    InitLexemes = Lexer ^ lex_compiled_lexemes,
    Instance    = lexer_instance(InitLexemes, InitLexemes, no, BufState).

%---------------------------------------------------------------------------
-- %

stop(LexerState) = Src :-
    lexer_state_args(LexerState, _Instance, _Buf, Src).

%---------------------------------------------------------------------------
-- %

read(Result, LexerState0, LexerState) :-
    lexer_state_args(LexerState0, Instance0, Buf0, Src0),
    read_0(Result, Instance0, Instance, Buf0, Buf, Src0, Src),
    args_lexer_state(Instance, Buf, Src, LexerState).



:- pred read_0(lexer_result(Tok),
            lexer_instance(Tok, Src), lexer_instance(Tok, Src),
            buf, buf, Src, Src).
:- mode read_0(out,
            in(lexer_instance), out(lexer_instance),
            array_di, array_uo, in, out) is det.
:- mode read_0(out,
            in(lexer_instance_u), out(lexer_instance_u),
            array_di, array_uo, di, uo) is det.

    % Basically, just read chars from the buf and advance the live lexemes
    % until we have a winner or hit an error (no parse).
    %
read_0(Result, Instance0, Instance, Buf0, Buf, Src0, Src) :-

    BufState0    = Instance0 ^ lexi_buf_state,

        % Fetch the next char; this may grow the buffer or pull more
        % input from the source.
    buf__read(BufReadResult, BufState0, BufState1, Buf0, Buf1, Src0, Src1),
    (
        BufReadResult = ok(Char),
        process_char(Result, Char,
                Instance0, Instance, BufState1, Buf1, Buf, Src1, Src)
    ;
            % End of input: see if anything already matched.
        BufReadResult = eof,
        Buf = Buf1,
        Src = Src1,
        process_eof(Result, Instance0, Instance, BufState1, Buf)
    ).

%---------------------------------------------------------------------------
-- %

    % Advance every live lexeme over Char.  If none survive, report the
    % best match found so far (or an error); otherwise record the new
    % live set and any new winner and keep reading.
    %
:- pred process_char(lexer_result(Tok), char,
            lexer_instance(Tok, Src), lexer_instance(Tok, Src),
            buf_state(Src), buf, buf, Src, Src).
:- mode process_char(out, in, in(lexer_instance), out(lexer_instance),
            in(buf_state), array_di, array_uo, in, out) is det.
:- mode process_char(out, in, in(lexer_instance_u), out(lexer_instance_u),
            in(buf_state_u), array_di, array_uo, di, uo) is det.

process_char(Result, Char, Instance0, Instance,
        BufState, Buf0, Buf, Src0, Src) :-

    LiveLexemes0 = Instance0 ^ lexi_live_lexemes,
    Winner0      = Instance0 ^ lexi_current_winner,

    advance_live_lexemes(Char, buf__cursor_offset(BufState),
            LiveLexemes0, LiveLexemes, Winner0, Winner),
    (
        LiveLexemes = [],               % Nothing left to consider.

        process_any_winner(Result, Winner, Instance0, Instance, BufState,
                Buf0, Buf, Src0, Src)
    ;
        LiveLexemes = [_ | _],          % Still some open possibilities.

        Instance1 = (((Instance0
                            ^ lexi_live_lexemes   := LiveLexemes)
                            ^ lexi_current_winner := Winner)
                            ^ lexi_buf_state      := BufState),
        read_0(Result, Instance1, Instance, Buf0, Buf, Src0, Src)
    ).

%---------------------------------------------------------------------------
-- %

:- pred process_any_winner(lexer_result(Tok), winner(Tok),
            lexer_instance(Tok, Src), lexer_instance(Tok, Src), 
            buf_state(Src), buf, buf, Src, Src).
:- mode process_any_winner(out, in,
            in(lexer_instance), out(lexer_instance),
            in(buf_state), array_di, array_uo, in, out) is det.
:- mode process_any_winner(out, in,
            in(lexer_instance_u), out(lexer_instance_u),
            in(buf_state_u), array_di, array_uo, di, uo) is det.

process_any_winner(Result, yes(ATok - Offset), Instance0, Instance,
        BufState0, Buf0, Buf, Src0, Src) :-

    BufState1 = buf__rewind_cursor(Offset, BufState0),
    Instance1 = ((( Instance0
                        ^ lexi_live_lexemes   := Instance0 ^
lexi_init_lexemes)
                        ^ lexi_current_winner := no)
                        ^ lexi_buf_state      := buf__commit(BufState1)),
    (
        ATok     = noval(Token),
        Result   = ok(Token),
        Instance = Instance1,
        Buf      = Buf0,
        Src      = Src0
    ;
        ATok     = value(Token),
        Result   = ok(Token, buf__string_to_cursor(BufState1, Buf)),
        Instance = Instance1,
        Buf      = Buf0,
        Src      = Src0
    ;
        ATok     = ignore,
        read_0(Result, Instance1, Instance, Buf0, Buf, Src0, Src)
    ).

process_any_winner(Result, no, Instance0, Instance,
        BufState0, Buf, Buf, Src, Src) :-

    Start     = buf__start_offset(BufState0),
    BufState1 = buf__rewind_cursor(Start + 1, BufState0),
    Result    = error(Start),
    Instance  = ((( Instance0
                        ^ lexi_live_lexemes   :=
                                Instance0 ^ lexi_init_lexemes)
                        ^ lexi_current_winner := no)
                        ^ lexi_buf_state      := buf__commit(BufState1)).

%---------------------------------------------------------------------------
-- %

    % Called when the input source is exhausted.  If some live lexeme is
    % in an accepting state, report its token; either way the live set
    % is emptied so that any subsequent call reports eof.
    %
:- pred process_eof(lexer_result(Tok),
            lexer_instance(Tok, Src), lexer_instance(Tok, Src),
            buf_state(Src), buf).
:- mode process_eof(out, in(lexer_instance), out(lexer_instance),
            in(buf_state), array_ui) is det.
:- mode process_eof(out, in(lexer_instance_u), out(lexer_instance_u),
            in(buf_state_u), array_ui) is det.

process_eof(Result, Instance0, Instance, BufState, Buf) :-

    ( if

        live_lexeme_in_accepting_state(Instance0 ^ lexi_live_lexemes, ATok)

      then

            % Return the token and set things up so that we return
            % eof next.
        (
            ATok   = noval(Token),
            Result = ok(Token)
        ;
            ATok   = value(Token),
            Result = ok(Token, buf__string_to_cursor(BufState, Buf))
        ;
            ATok   = ignore,
            Result = eof
        )

      else

        Result     = eof
    ),
        % Empty the live set and commit the buffer: we are done.
    Instance  = ((Instance0
                        ^ lexi_live_lexemes := [])
                        ^ lexi_buf_state    := buf__commit(BufState)).

%---------------------------------------------------------------------------
-- %

    % Advance each live lexeme's state machine over Char, dropping any
    % lexeme with no transition on Char.  A lexeme that lands in an
    % accepting state becomes the current winner at Offset.
    %
    % NOTE(review): because the winner is threaded down the list and
    % overwritten by each accepting lexeme, when several lexemes accept
    % at the same offset the LAST one in the list wins -- confirm that
    % this precedence is intended.
    %
:- pred advance_live_lexemes(char, offset,
            list(live_lexeme(Token)), list(live_lexeme(Token)),
            winner(Token), winner(Token)).
:- mode advance_live_lexemes(in, in, in, out, in, out) is det.

advance_live_lexemes(_Char, _Offset, [], [], Winner, Winner).

advance_live_lexemes(Char, Offset, [L0 | Ls0], Ls, Winner0, Winner) :-

    State0        = L0 ^ clxm_state,
    ATok          = L0 ^ clxm_token,

    ( if next_state(L0, State0, Char, State, IsAccepting) then

        (
            IsAccepting = no,
            Winner1     = Winner0
        ;
            IsAccepting = yes,
            Winner1     = yes(ATok - Offset)
        ),
        advance_live_lexemes(Char, Offset, Ls0, Ls1, Winner1, Winner),
        Ls = [( L0 ^ clxm_state := State ) | Ls1]

      else

            % No transition on Char: this lexeme drops out.
        advance_live_lexemes(Char, Offset, Ls0, Ls, Winner0, Winner)
    ).

%---------------------------------------------------------------------------
-- %

:- pred live_lexeme_in_accepting_state(list(live_lexeme(Tok)),
                annotated_token(Tok)).
:- mode live_lexeme_in_accepting_state(in, out) is semidet.

live_lexeme_in_accepting_state([L | Ls], Token) :-
    ( if in_accepting_state(L)
      then Token = L ^ clxm_token
      else live_lexeme_in_accepting_state(Ls, Token)
    ).

%---------------------------------------------------------------------------
-- %
%---------------------------------------------------------------------------
-- %

    % It's much more convenient (especially for integration with, e.g.
    % parsers such as moose) to package up the lexer_instance, buf
    % and Src in a single object.
    %
    % XXX Unfortunately, we don't yet have nested unique objects, so
    % we have to fake it out with a judicious bit of C.  Forgive me...
    % Note that this code uses non-public parts of the Mercury runtime
    % (e.g. MR_GC_malloc) and is therefore on somewhat dodgy ground.

    % The opaque state is a raw pointer to a C struct holding the three
    % components (see the XXX note above about faking nested uniqueness).
    %
:- type lexer_state(_Tok, _Src)
    --->    lexer_state(c_pointer).

:- pragma c_header_code("
    struct c_lexer_state {
        MR_Word lexer_instance;
        MR_Word buf;
        MR_Word src;
    };
").

%---------------------------------------------------------------------------
-- %

:- pred args_lexer_state(lexer_instance(Tok,Src),buf,Src,
lexer_state(Tok,Src)).
:- mode args_lexer_state(in(lexer_instance), array_di, in, out) is det.
:- mode args_lexer_state(in(lexer_instance_u), array_di, di, uo) is det.

:- pragma c_code(
    args_lexer_state(RL::in(lexer_instance), B::array_di, S::in, LS::out),
    [will_not_call_mercury],
    "
    LS = (MR_Word) MR_GC_malloc(sizeof(struct c_lexer_state));
    ((struct c_lexer_state *) LS)->lexer_instance = RL;
    ((struct c_lexer_state *) LS)->buf           = B;
    ((struct c_lexer_state *) LS)->src           = S;
").

:- pragma c_code(
    args_lexer_state(RL::in(lexer_instance_u), B::array_di, S::di, LS::uo),
    [will_not_call_mercury],
    "
    LS = (MR_Word) MR_GC_malloc(sizeof(struct c_lexer_state));
    ((struct c_lexer_state *) LS)->lexer_instance = RL;
    ((struct c_lexer_state *) LS)->buf           = B;
    ((struct c_lexer_state *) LS)->src           = S;
").

%---------------------------------------------------------------------------
-- %

:- pred
lexer_state_args(lexer_state(Tok,Src),lexer_instance(Tok,Src),buf,Src).
:- mode lexer_state_args(in, out(lexer_instance),   array_uo, out) is det.
:- mode lexer_state_args(di, out(lexer_instance_u), array_uo, uo)  is det.

:- pragma c_code(
    lexer_state_args(LS::in, RL::out(lexer_instance), B::array_uo, S::out),
    [will_not_call_mercury],
    "
    RL = ((struct c_lexer_state *) LS)->lexer_instance;
    B  = ((struct c_lexer_state *) LS)->buf;
    S  = ((struct c_lexer_state *) LS)->src;
").

:- pragma c_code(
    lexer_state_args(LS::di, RL::out(lexer_instance_u), B::array_uo, S::uo),
    [will_not_call_mercury],
    "
    RL = ((struct c_lexer_state *) LS)->lexer_instance;
    B  = ((struct c_lexer_state *) LS)->buf;
    S  = ((struct c_lexer_state *) LS)->src;
").

%---------------------------------------------------------------------------
-- %

manipulate_source(P, State0, State) :-
    lexer_state_args(State0, Instance, Buf, Src0),
    P(Src0, Src),
    args_lexer_state(Instance, Buf, Src, State).

%---------------------------------------------------------------------------
-- %
%---------------------------------------------------------------------------
-- %

%---------------------------------------------------------------------------
---%
% lex.automata.m
% Ralph Becket <rbeck at microsoft.com>
% Fri Aug 18 15:48:09 BST 2000
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix
% 
% Basic types and insts etc. for FDAs and FNAs over chars.
%---------------------------------------------------------------------------
---%

:- module lex__automata.

:- interface.

:- import_module set, list, char.



    % States are labelled with non-negative integers.
    %
    % States are labelled with non-negative integers.
    %
:- type state
    ==      int.

    % A state machine: its start state, its set of accepting ("stop")
    % states, and its transition list.
    %
:- type state_mc
    --->    state_mc(
                smc_start_state         :: state,
                smc_stop_states         :: set(state),
                smc_state_transitions   :: list(transition)
            ).

    % A state_mc whose transition list contains no null (epsilon)
    % transitions.
    %
:- inst null_transition_free_state_mc
    ==      bound(state_mc(ground, ground, atom_transitions)).

:- type transitions
    ==      list(transition).

:- inst atom_transitions == list_skel(atom_transition).
:- inst null_transitions == list_skel(null_transition).

    % null(From, To) is an epsilon transition (consumes no input);
    % trans(From, Char, To) consumes Char.
    %
:- type transition
    --->    null(state, state)
    ;       trans(state, char, state).

:- inst atom_transition == bound(trans(ground, ground, ground)).
:- inst null_transition == bound(null(ground, ground)).

%---------------------------------------------------------------------------
---%
%---------------------------------------------------------------------------
---%

%
----------------------------------------------------------------------------
%
% lex.buf.m
% Ralph Becket <rbeck at microsoft.com>
% Sat Aug 19 16:56:30 BST 2000
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix
%
% This module implements the rolling char buffer.  The char
% buffer is optimised for efficiency.
%
% The buffer stores chars read from an input source (e.g. IO
% or string).  Because the lexer can want to `unread' chars
% (when a long candidate lexeme fails), the buffer may
% contain `read ahead' chars.  The structure of the buffer
% is as follows:
%
%    buf[0]                                       buf[len]
%    |                  len = end - start                |
%    v                                                   v
%   +---------------------------------------------------+
%   |.|.|.|.|.|a|b|c|d|e|f|g|h|i|j|k|l| | | | | | | | | |
%   +---------------------------------------------------+
%    ^         ^           ^           ^                 ^ 
%    |         |           |           |                 |
%    origin    start       cursor      end        terminus
%
% origin, start etc. are all recorded in terms of offsets
% (number of chars) from the start of the input stream,
% counting the first char read as being at offset 1.  Hence,
% the char at the cursor is at buf[cursor - origin].
%
% READING CHARS
%
% * In the diagram, `g' is the next char that will be read.
%
% The cursor marks the position of the next char to be read in.
%
% If the cursor advances to the end, then a new char is read
% from the input and inserted into the buffer at the end and
% the end marker is incremented.
%
% If the end marker advances to the terminus, then the
% buffer is extended and the terminus adjusted
% appropriately.  The buffer may take this opportunity to
% garbage collect the inaccessible chars between the origin
% and the start marker.
%
% EOF
%
% * In the diagram, if EOF had been detected then the end
% marker would give the offset at which it occurred.
%
% When EOF is read from the input stream, a special eof flag
% is set (and the end marker, of course, will give its offset).
% Any attempt to read at or past this point will cause the
% buffer to return the EOF signal.
%
% REWINDING
%
% * In the diagram, the cursor may be rewound to any point
% between the start marker and itself, inclusive.
%
% At any point, the cursor may be reset to any point between
% itself and the start marker inclusive.
%
%
% At any point, the user may ask for the offset of the cursor.
%
% STRING EXTRACTION
%
% * In the diagram, the string read in so far is "abcdef".
%
% The buffer provides a facility to return the substring
% consisting of the chars between the start marker and up
% to, but not including, that under the cursor.
%
% COMMITS
%
% * In the diagram, a commit will move the start marker to
% be the same as the cursor.
%
% The user can issue a commit order to the buffer which
% moves the start pointer to where the cursor is, preventing
% rewinds back past this point.  This is important since it
% means that the region prior to the cursor in the buffer is
% now available for garbage collection.
%
%
----------------------------------------------------------------------------
%

:- module lex__buf.

:- interface.

:- import_module int, array, char, bool, string, io.



:- type offset
    ==      int.

    % XXX I suspect (from looking at the source) that array.m
    % allocates one word per char for a char array.  This, of
    % course, is shockingly wasteful.  Should we have a
    % dedicated char_array data type (and bool_array and ...)
    % or should array.m do packing where possible?
    %
:- type buf
    ==      array(char).

    % T is the type of the input source (typically io__state or string);
    % the user must initialise the buffer by specifying an appropriate
    % read predicate.
    %
:- type buf_state(T)
    --->    buf_state(
                buf_origin                  :: offset,
                buf_start                   :: offset,
                buf_cursor                  :: offset,
                buf_end                     :: offset,
                buf_terminus                :: offset,
                buf_eof_seen                :: bool,    % If `yes' then
buf_end
                                                        % has the offset
                buf_read_pred               :: buf_read_pred(T)
            ).
:- inst buf_state
    ==      bound(buf_state(ground, ground, ground, ground, ground, ground,
                                buf_read_pred)).
:- inst buf_state_u
    ==      bound(buf_state(ground, ground, ground, ground, ground, ground,
                                buf_read_pred_u)).

    % buf_read_pred(Offset, Result, SrcIn, SrcOut) reads the char at
    % Offset from SrcIn and returns SrcOut.
    %
:- type buf_read_pred(T)
    ==      pred(offset, buf_read_result, T, T).
:- inst buf_read_pred
    ==      ( pred(in, out, in, out) is det ).
:- inst buf_read_pred_u
    ==      ( pred(in, out, di, uo) is det ).

    % Any errors should be reported by raising an exception.
    %
:- type buf_read_result
    --->    ok(char)
    ;       eof.



    % Returns an empty buffer and an initialised buf_state.
    %
:- pred init(buf_read_pred(T), buf_state(T), buf).
:- mode init(in(buf_read_pred), out(buf_state), array_uo) is det.
:- mode init(in(buf_read_pred_u), out(buf_state_u), array_uo) is det.

    % Reads the next char and advances the cursor.  Updates the
    % buf_state, the buf and the input.
    %
:- pred read(buf_read_result, buf_state(T), buf_state(T), buf, buf, T, T).
:- mode read(out, in(buf_state), out(buf_state), array_di, array_uo,
                    in, out) is det.
:- mode read(out, in(buf_state_u), out(buf_state_u), array_di, array_uo,
                    di, uo) is det.

    % Returns the offset of the start marker.
    %
:- func start_offset(buf_state(T)) = offset.
:- mode start_offset(in(buf_state)) = out is det.
:- mode start_offset(in(buf_state_u)) = out is det.

    % Returns the offset of the cursor.
    %
:- func cursor_offset(buf_state(T)) = offset.
:- mode cursor_offset(in(buf_state)) = out is det.
:- mode cursor_offset(in(buf_state_u)) = out is det.

    % Rewinds the buffer.  An exception is raised if the offset provided
    % is not legitimate.
    %
:- func rewind_cursor(offset, buf_state(T)) = buf_state(T).
:- mode rewind_cursor(in, in(buf_state)) = out(buf_state) is det.
:- mode rewind_cursor(in, in(buf_state_u)) = out(buf_state_u) is det.

    % Extracts the string of chars between the start and cursor.
    %
:- func string_to_cursor(buf_state(T), buf) = string.
:- mode string_to_cursor(in(buf_state), array_ui) = out is det.
:- mode string_to_cursor(in(buf_state_u), array_ui) = out is det.

    % Advances the start marker to the cursor.  Rewinds past the
    % cursor will therefore no longer be possible.
    %
:- func commit(buf_state(T)) = buf_state(T).
:- mode commit(in(buf_state)) = out(buf_state) is det.
:- mode commit(in(buf_state_u)) = out(buf_state_u) is det.

    % Handy read predicates.

% :- func read_from_stdin = buf_read_pred(io__state).
% :- mode read_from_stdin = out(buf_read_pred_u) is det.
% 
% :- func read_from_string = buf_read_pred(string).
% :- mode read_from_string = out(buf_read_pred) is det.

:- pred read_from_stdin(offset, buf_read_result, io__state, io__state).
:- mode read_from_stdin(in, out, di, uo) is det.

:- pred read_from_string(offset, buf_read_result, string, string).
:- mode read_from_string(in, out, in, out) is det.

%
----------------------------------------------------------------------------
%
%
----------------------------------------------------------------------------
%

:- implementation.

:- import_module exception.



    % The amount the buffer is grown by if (a) more space is
    % required and (b) the available space is smaller than
    % this amount.
    %
:- func low_water_mark = int.
low_water_mark = 256.

    % Initial size (in chars) of a freshly allocated buf.
    %
:- func initial_buf_size = int.
initial_buf_size = 1024.

    % XXX Debugging values (small sizes exercise the GC/grow paths).
    %
% :- func low_water_mark = int.
% low_water_mark = 16.
% 
% :- func initial_buf_size = int.
% initial_buf_size = 32.

%
----------------------------------------------------------------------------
%

    % A fresh buffer, filled with '@' as an arbitrary filler char, and a
    % buf_state with every marker at offset 0 and the terminus at the
    % initial buffer size.
    %
init(BufReadPred, BufState, Buf) :-
    BufState = buf_state(0, 0, 0, 0, initial_buf_size, no, BufReadPred),
    Buf      = array__init(initial_buf_size, ('@')).

%
----------------------------------------------------------------------------
%

    % Four cases, tried in order:
    %   1. the cursor is behind the end marker: return the buffered char;
    %   2. eof has already been seen at the cursor: return eof;
    %   3. there is room before the terminus: read a new char from the
    %      source into the buffer;
    %   4. otherwise: garbage collect/extend the buffer and retry.
    %
read(Result, BufState0, BufState, Buf0, Buf, Src0, Src) :-

    Origin   = BufState0 ^ buf_origin,
    Start    = BufState0 ^ buf_start,
    Cursor   = BufState0 ^ buf_cursor,
    End      = BufState0 ^ buf_end,
    Terminus = BufState0 ^ buf_terminus,
    EOFSeen  = BufState0 ^ buf_eof_seen,
    ReadP    = BufState0 ^ buf_read_pred,

    ( if Cursor < End then

            % Read-ahead char already in the buffer.
        Result   = ok(array__lookup(Buf0, Cursor - Origin)),
        BufState = ( BufState0 ^ buf_cursor := Cursor + 1 ),
        Buf      = Buf0,
        Src      = Src0

      else /* Cursor = End */ if EOFSeen = yes then

        Result   = eof,
        BufState = BufState0,
        Buf      = Buf0,
        Src      = Src0

      else if End < Terminus then

            % Pull a new char from the input source.
        ReadP(Cursor, Result, Src0, Src),

        ( if Result = ok(Char) then

            Buf = array__set(Buf0, End - Origin, Char),
            BufState = (( BufState0
                                ^ buf_cursor := Cursor + 1 )
                                ^ buf_end    := End + 1 )
          else
                % Result = eof; leave the markers where they are.
                % (buf_eof_seen is presumably set elsewhere --
                % TODO(review): confirm, it is not updated here.)
            Buf      = Buf0,
            BufState = BufState0
        )

      else /* Need to GC and/or extend the buffer */

        GarbageLength = Start - Origin,
        adjust_buf(GarbageLength, ExtraLength, Buf0, Buf1),
        NewOrigin     = Origin + GarbageLength,
        NewTerminus   = Terminus + GarbageLength + ExtraLength,
        BufState1     = (( BufState0
                                ^ buf_origin   := NewOrigin )
                                ^ buf_terminus := NewTerminus ),
        read(Result, BufState1, BufState, Buf1, Buf, Src0, Src)
    ).

%----------------------------------------------------------------------------%

    % Garbage collects the chars between the origin and start and
    % extends the buffer if the remaining space is below the low
    % water mark.
    %
    % adjust_buf(GarbageLength, ExtraLength, Buf0, Buf):
    % GarbageLength is the number of dead chars at the bottom of
    % Buf0; ExtraLength is the number of slots the buffer was grown
    % by (0 if it was not grown).
    %
:- pred adjust_buf(int, int, buf, buf).
:- mode adjust_buf(in, out, array_di, array_uo) is det.

adjust_buf(GarbageLength, ExtraLength, Buf0, Buf) :-

    Size0 = array__size(Buf0),

        % If GC would reclaim only a little space, also grow the
        % buffer: allocate a fresh, larger target to copy into.
    ( if GarbageLength < low_water_mark then
        array__init(Size0 + low_water_mark, ('@'), Buf1),
        ExtraLength = low_water_mark
      else
            % NOTE(review): in this branch the source and target of
            % shift_buf are the same array.  Operationally safe
            % (elements only move to lower indices), but it aliases
            % an array_ui with an array_di argument - confirm the
            % mode checker accepts this.
        Buf1 = Buf0,
        ExtraLength = 0
    ),

    Buf = shift_buf(0, Size0 - GarbageLength, GarbageLength, Buf0, Buf1).

%----------------------------------------------------------------------------%

    % shift_buf(I, Hi, Disp, Src, Tgt): copy Src[I + Disp .. Hi - 1 + Disp]
    % into Tgt[I .. Hi - 1], i.e. shift the live portion of the
    % buffer down by Disp slots.
:- func shift_buf(int, int, int, buf, buf) = buf.
:- mode shift_buf(in, in, in, array_ui, array_di) = array_uo is det.

shift_buf(I, Hi, Disp, Src, Tgt) =
    ( if I < Hi then
        shift_buf(I + 1, Hi, Disp, Src,
            array__set(Tgt, I, array__lookup(Src, I + Disp)))
      else
        Tgt
    ).

%----------------------------------------------------------------------------%

    % The offset in the source stream of the first char of the
    % current lexeme.
start_offset(BufState) = BufState ^ buf_start.

%----------------------------------------------------------------------------%

    % The offset in the source stream one past the last char read.
cursor_offset(BufState) = BufState ^ buf_cursor.

%----------------------------------------------------------------------------%

    % rewind_cursor(Offset, BufState): move the cursor back to
    % Offset, which must lie in [buf_start, buf_cursor]; an offset
    % outside that range causes an exception.
rewind_cursor(Offset, BufState) =
    ( if ( Offset < BufState ^ buf_start ; BufState ^ buf_cursor < Offset )
then
        throw("buf: rewind/2: offset arg outside valid range")
      else
        BufState ^ buf_cursor := Offset
    ).

%----------------------------------------------------------------------------%

    % The string of chars between the current lexeme start and the
    % cursor.  The lexeme occupies source offsets
    % [buf_start, buf_cursor - 1]; From/To are the corresponding
    % (inclusive) indices into the buffer array.
    % NOTE(review): assumes array__fetch_items/3 includes both end
    % points, giving exactly Cursor - Start chars - confirm.
string_to_cursor(BufState, Buf) = String :-
    From   = BufState ^ buf_start - BufState ^ buf_origin,
    Length = (BufState ^ buf_cursor - 1 - BufState ^ buf_start),
    To     = From + Length,
    String = string__from_char_list(array__fetch_items(Buf, From, To)).

%----------------------------------------------------------------------------%

    % Commit to the chars read so far: the next lexeme starts at the
    % cursor, so everything before it becomes garbage that
    % adjust_buf/4 may later reclaim.
commit(BufState) = ( BufState ^ buf_start := BufState ^ buf_cursor ).

%----------------------------------------------------------------------------%

    % Read predicate for lexing from stdin.  The offset argument is
    % ignored: io__read_char always delivers the next char in the
    % stream.  An I/O error is converted into an exception carrying
    % the error message.
read_from_stdin(_Offset, Result) -->
    io__read_char(IOResult),
    { IOResult = ok(Char),              Result = ok(Char)
    ; IOResult = eof,                   Result = eof
    ; IOResult = error(E),              io__error_message(E, M), throw(M)
    }.

%----------------------------------------------------------------------------%

    % Read predicate for lexing from a string: the string itself is
    % the (unchanged) source state and Offset indexes directly into
    % it.
read_from_string(Offset, Result, String, String) :-
    ( if Offset >= string__length(String) then
        Result = eof
      else
        % Safe: Offset < length(String) on this branch.
        Result = ok(string__unsafe_index(String, Offset))
    ).

%----------------------------------------------------------------------------%
%----------------------------------------------------------------------------%

%-----------------------------------------------------------------------------
% lex.fna_to_fda.m
% Ralph Becket <rbeck at microsoft.com>
% Fri Aug 18 12:30:25 BST 2000
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix
%
% Powerset construction used to transform FNAs into FDAs.
%-----------------------------------------------------------------------------%

:- module lex__fna_to_fda.

:- interface.

:- import_module lex__automata.



:- func fna_to_fda(state_mc) = state_mc.
:- mode fna_to_fda(in(null_transition_free_state_mc)) =
            out(null_transition_free_state_mc) is det.

%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%

:- implementation.

:- import_module counter, list, set, map, char, int.



    % A collection of FNA state sets; each state set becomes one
    % state of the FDA under the powerset construction.
:- type state_sets
    ==      set(state_set).

    % A set of FNA states, treated as a single FDA state.
:- type state_set
    ==      set(state).

:- type state_set_transitions
    ==      list(state_set_transition).

    % trans(S, C, S'): the FDA moves from state set S to state set
    % S' on char C.
:- type state_set_transition
    --->    trans(state_set, char, state_set).

    % Maps each state set to its final (numeric) FDA state name.
:- type state_set_no_map
    ==      map(state_set, int).

%-----------------------------------------------------------------------------%

    % An FNA with no transitions: the powerset construction has
    % nothing to do, so return the machine unchanged.
    %
fna_to_fda(FNA) = FNA :-

        % An FNA with no transitions is probably a bug...

    FNA ^ smc_state_transitions = [].

fna_to_fda(FNA) = FDA :-

    FNA ^ smc_state_transitions = [_ | _],

        % Do some unpacking of the FNA.
        %
    FNAStopStates    = FNA ^ smc_stop_states,
    FNATransitions   = FNA ^ smc_state_transitions,
    FDAStartStateSet = set__make_singleton_set(FNA ^ smc_start_state),
    FDAStartStateSets = set__make_singleton_set(FDAStartStateSet),

        % Calculate the powerset version of the FDA from the FNA:
        % iterate to a fixed point from the start state set.
        %
    compute_fda_state_sets_and_transitions(
        FNATransitions, FDAStartStateSets,
        FDAStartStateSets,  FDAStateSets,
        [],                 FDAStateSetTransitions
    ),
        % A state set is accepting iff it contains an FNA stop state.
    FDAStopStateSets = compute_fda_stop_state_sets(FNAStopStates,
FDAStateSets),

        % Replace the powerset state identifiers with numbers.
        %
    FDAStateNos      = number_state_sets(FDAStateSets),
    map__lookup(FDAStateNos, FDAStartStateSet, FDAStartState),
    FDAStopStates0   = list__map(
                            map__lookup(FDAStateNos),
                            set__to_sorted_list(FDAStopStateSets)
                       ),
    FDAStopStates    = set__list_to_set(FDAStopStates0),
    FDATransitions   = map_state_set_transitions_to_numbers(
                            FDAStateNos,
                            FDAStateSetTransitions
                       ),

        % Pack up the result.
        %
    FDA = state_mc(FDAStartState, FDAStopStates, FDATransitions).

%-----------------------------------------------------------------------------%

    % If S is a state set, then S -c-> S' where
    % S' = {y | x in S, x -c-> y}
    %
    % We iterate to the least fixed point starting with the start
    % state set.
    %
    % compute_fda_state_sets_and_transitions(Ts, NewSs0, Ss0, Ss,
    %   STs0, STs): Ts are the FNA transitions; NewSs0 holds the
    %   state sets discovered in the previous iteration; Ss0/Ss
    %   accumulate all state sets and STs0/STs all state set
    %   transitions.
    %
:- pred compute_fda_state_sets_and_transitions(
            transitions, state_sets,
            state_sets, state_sets,
            state_set_transitions, state_set_transitions).
:- mode compute_fda_state_sets_and_transitions(in, in, in, out, in, out)
            is det.

compute_fda_state_sets_and_transitions(Ts, NewSs0, Ss0, Ss, STs0, STs) :-

    ( if set__empty(NewSs0) then

            % Fixed point reached: nothing new was discovered.
        Ss  = Ss0,
        STs = STs0

      else

            % All transitions leaving the newly discovered state
            % sets.
        NewSTs =
            list__condense(
                list__map(state_set_transitions(Ts),
                    set__to_sorted_list(NewSs0))
            ),
        STs1 = list__append(NewSTs, STs0),

            % Their target state sets; any not seen before must be
            % explored in the next iteration.
        TargetSs =
            set__list_to_set(
                list__map(( func(trans(_, _, S)) = S ), NewSTs)
            ),
        NewSs = TargetSs `set__difference` Ss0,
        Ss1   = NewSs `set__union` Ss0,

        compute_fda_state_sets_and_transitions(Ts, NewSs, Ss1, Ss, STs1, STs)
    ).

%-----------------------------------------------------------------------------%

    % Given a state set and a set of transition chars for that
    % state set, find the set of state set transitions (said
    % Peter Piper):
    %
    % state_set_transitions(S) = {S -c-> S' | x in S, S' = {y | x -c-> y}}
    %
:- func state_set_transitions(transitions, state_set) =
state_set_transitions.

state_set_transitions(Ts, S) = STs :-
        % The distinct chars on which some member of S can move.
    TCs = transition_chars(Ts, S),
        % One state set transition per such char.
    STs = list__map(state_set_transition(Ts, S), TCs).

%-----------------------------------------------------------------------------%

    % Given a state set, find all the transition chars:
    %
    % transition_chars(S) = {c | x in S, some [y] x -c-> y}
    %
:- func transition_chars(transitions, state_set) = list(char).

transition_chars(Ts, S) = Chars :-
        % Chars for each member state, flattened, sorted, deduped.
    PerState = list__map(transition_chars_for_state(Ts),
                    set__to_sorted_list(S)),
    Chars    = list__sort_and_remove_dups(list__condense(PerState)).

%-----------------------------------------------------------------------------%

    % The chars on which state X has an outgoing transition, in
    % transition-list order (duplicates possible; the caller
    % removes them).
:- func transition_chars_for_state(transitions, state) = list(char).
:- mode transition_chars_for_state(in, in) = out is det.

transition_chars_for_state(Ts, X) =
    list__filter_map(transition_char_for_state(X), Ts).

%-----------------------------------------------------------------------------%

    % Succeeds with C iff the given transition leaves state X.
:- func transition_char_for_state(state, transition) = char.
:- mode transition_char_for_state(in, in) = out is semidet.

transition_char_for_state(X, trans(X, C, _Y)) = C.

%-----------------------------------------------------------------------------%

    % Given a state set and a char, find the state set transition:
    %
    % state_set_transition(S, c) = S -c-> target_state_set(S, c)
    %
    % (Ts is the full FNA transition list.)
    %
:- func state_set_transition(transitions, state_set, char) =
            state_set_transition.

state_set_transition(Ts, S, C) = trans(S, C, target_state_set(Ts, S, C)).

%-----------------------------------------------------------------------------%

    % Given a state set and a char, find the target state set:
    %
    % target_state_set(S, c) = {y | x in S, x -c-> y}
    %
    % i.e. the union over members x of S of x's successors on c.
    %
:- func target_state_set(transitions, state_set, char) = state_set.

target_state_set(Ts, S, C) =
    set__power_union(set__map(target_state_set_0(Ts, C), S)).

%-----------------------------------------------------------------------------%

    % The set of states reachable from the single state X on char C.
:- func target_state_set_0(transitions, char, state) = state_set.

target_state_set_0(Ts, C, X) =
    set__list_to_set(list__filter_map(target_state(X, C), Ts)).

%-----------------------------------------------------------------------------%

    % Succeeds with Y iff the transition is X -C-> Y.
:- func target_state(state, char, transition) = state.
:- mode target_state(in, in, in) = out is semidet.

target_state(X, C, trans(X, C, Y)) = Y.

%-----------------------------------------------------------------------------%

    % An FDA state set is a stop state iff it contains at least one
    % FNA stop state.
:- func compute_fda_stop_state_sets(state_set, state_sets) = state_sets.

compute_fda_stop_state_sets(StopStates, StateSets) =
    set__filter_map(stop_state_set(StopStates), StateSets).

%-----------------------------------------------------------------------------%

    % Succeeds (returning StateSet itself) iff StateSet contains at
    % least one FNA stop state.
:- func stop_state_set(state_set, state_set) = state_set.
:- mode stop_state_set(in, in) = out is semidet.

stop_state_set(StopStates, StateSet) = StateSet :-
    not set__empty(StopStates `set__intersect` StateSet).

%-----------------------------------------------------------------------------%

    % Number the state sets 0, 1, 2, ... in sorted order.  Note the
    % (N + 1)::out in the closure head: the incremented counter is
    % produced by head unification.
:- func number_state_sets(state_sets) = state_set_no_map.

number_state_sets(Ss) = StateNos :-
    list__foldl2(
        ( pred(S::in, N::in, (N + 1)::out, Map0::in, Map::out) is det :-
            Map = map__set(Map0, S, N)
        ),
        set__to_sorted_list(Ss),
        0,          _,
        map__init,  StateNos
    ).

%-----------------------------------------------------------------------------%

    % Rewrite each state set transition as a numeric transition,
    % using the numbering produced by number_state_sets/1.
    %
:- func map_state_set_transitions_to_numbers(state_set_no_map,
            state_set_transitions
        ) = transitions.
:- mode map_state_set_transitions_to_numbers(in, in) =
            out(atom_transitions) is det.

map_state_set_transitions_to_numbers(Map, STs) =
    list__map(
        ( func(trans(SX, C, SY)) = trans(X, C, Y) :-
            X = map__lookup(Map, SX),
            Y = map__lookup(Map, SY)
        ),
        STs
    ).

%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%

%-----------------------------------------------------------------------------
% lex.lexeme.m
% Ralph Becket <rbeck at microsoft.com>
% Sat Aug 19 08:22:32 BST 2000
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix
%
% A lexeme combines a token with a regexp.  The lexer compiles
% lexemes and returns the longest successful parse in the input
% stream or an error if no match occurs.
%-----------------------------------------------------------------------------%

:- module lex__lexeme.

:- interface.

:- import_module bool, char, array, bitmap.
:- import_module lex__automata, lex__regexp.



    % A lexeme pairs the token to be delivered with the regexp that
    % matches it.
:- type lexeme(T)
    --->    lexeme(
                lxm_token               :: T,
                lxm_regexp              :: regexp
            ).

    % A lexeme whose regexp has been compiled to table form: the
    % token, the current automaton state, and the transition tables.
:- type compiled_lexeme(T)
    --->    compiled_lexeme(
                clxm_token              :: T,
                clxm_state              :: state,
                clxm_transition_map     :: transition_map
            ).

    % A bitmap of the accepting states plus one transition row per
    % state.
:- type transition_map
    --->    transition_map(
                trm_accepting_states    :: bitmap,
                trm_rows                :: array(row)
            ).

    % A transition row is an array of transitions ordered by char
    % code.
    %
:- type row
    ==      array(byte_transition).

    % A byte_transition encodes a target state no. in its upper bits
    % and the char byte value in its lower eight bits for which the
    % transition is valid.
    %
:- type byte_transition
    ==      int.



:- func compile_lexeme(lexeme(T)) = compiled_lexeme(T).

    % next_state(CLXM, CurrentState, Char, NextState, IsAccepting)
    % succeeds iff there is a transition in CLXM from CurrentState
    % to NextState via Char; IsAccepting is `yes' iff NextState is
    % an accepting state.
    %
:- pred next_state(compiled_lexeme(T), state, char, state, bool).
:- mode next_state(in, in, in, out, out) is semidet.

    % Succeeds iff a compiled_lexeme is in an accepting state.
    %
:- pred in_accepting_state(compiled_lexeme(T)).
:- mode in_accepting_state(in) is semidet.

%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%

:- implementation.

:- import_module list, set.
:- import_module lex__automata, lex__fna_to_fda.

%-----------------------------------------------------------------------------%

    % Compile a lexeme's regexp into table-driven automaton form:
    % regexp -> FNA -> null-transition-free FNA -> FDA -> tables.
compile_lexeme(Lexeme) = CompiledLexeme :-
    Lexeme         = lexeme(Token, RegExp),
    FNA            = remove_null_transitions(regexp_to_fna(RegExp)),
    FDA            = fna_to_fda(FNA),
    StartState     = FDA ^ smc_start_state,
    StopStates     = FDA ^ smc_stop_states,
    Transitions    = FDA ^ smc_state_transitions,
        % States are numbered from 0, so the count is max + 1.
    N              = 1 + find_top_state(Transitions),
    Accepting      = set_accepting_states(StopStates, bitmap__new(N, no)),
    Rows           = array(set_up_rows(0, N, Transitions)),
    TransitionMap  = transition_map(Accepting, Rows),
    CompiledLexeme = compiled_lexeme(Token, StartState, TransitionMap).

%-----------------------------------------------------------------------------%

    % The highest state number mentioned in a transition list (0 for
    % the empty list); used to size the bitmap and the row array.
:- func find_top_state(transitions) = int.
:- mode find_top_state(in(atom_transitions)) = out is det.

find_top_state([])                    = 0.
find_top_state([trans(X, _, Y) | Ts]) = max(X, max(Y, find_top_state(Ts))).

%-----------------------------------------------------------------------------%

    % Set the bitmap bit for each accepting (stop) state.
:- func set_accepting_states(set(state), bitmap) = bitmap.
:- mode set_accepting_states(in, bitmap_di) = bitmap_uo is det.

set_accepting_states(States, Bitmap0) =
    set_accepting_states_0(set__to_sorted_list(States), Bitmap0).



    % Fold bitmap__set over the list of accepting states.
:- func set_accepting_states_0(list(state), bitmap) = bitmap.
:- mode set_accepting_states_0(in, bitmap_di) = bitmap_uo is det.

set_accepting_states_0([], Bitmap) = Bitmap.

set_accepting_states_0([St | States], Bitmap) =
    set_accepting_states_0(States, bitmap__set(Bitmap, St)).

%-----------------------------------------------------------------------------%

    % Build the transition row for every state 0 .. N - 1.  The
    % transitions already packed into earlier rows are removed from
    % the transition list as we go.
:- func set_up_rows(int, int, transitions) = list(row).
:- mode set_up_rows(in, in, in(atom_transitions)) = out is det.

set_up_rows(State, N, Ts0) = Rows :-
    ( if State < N then
        compile_transitions_for_state(State, [], Row, Ts0, [], Ts),
        Rows = [Row | set_up_rows(State + 1, N, Ts)]
      else
        Rows = []
    ).

%-----------------------------------------------------------------------------%

:- pred compile_transitions_for_state(int, list(byte_transition), row,
            transitions, transitions, transitions).
:- mode compile_transitions_for_state(in, in, array_uo,
            in(atom_transitions), in(atom_transitions),
out(atom_transitions))
                    is det.

    % compile_transitions_for_state(I, IBTs0, Row, Ts0, Ts1, Ts):
    % partition Ts0 into the transitions leaving state I (packed as
    % byte transitions into Row) and the rest (prepended to Ts1 to
    % give Ts).  The row is sorted on the char byte so that
    % find_next_state/3 can binary search it.
    % NOTE(review): the original XXX claimed sorting on chars was
    % still missing, but list__sort(groovy_cmp, IBTs) below does
    % sort on the char byte - confirm this list__sort/2 call form
    % (with a comparison function) is the one the library provides.
compile_transitions_for_state(_, IBTs, Row, [], Ts, Ts) :-
    Row = array(list__sort(groovy_cmp, IBTs)).

compile_transitions_for_state(I, IBTs0, Row, [T | Ts0], Ts1, Ts) :-
    ( if T = trans(I, C, Y) then
            % Pack target state in the high bits, char in the low
            % eight bits.
        IBTs = [(Y << 8) \/ char__to_int(C) | IBTs0],
        compile_transitions_for_state(I, IBTs, Row, Ts0, Ts1, Ts)
      else
        compile_transitions_for_state(I, IBTs0, Row, Ts0, [T | Ts1], Ts)
    ).



    % Orders byte transitions by their char byte (the low eight
    % bits).
:- func groovy_cmp(int, int) = comparison_result.

groovy_cmp(X, Y) = R :-
    compare(R, X /\ 0xff, Y /\ 0xff).

%-----------------------------------------------------------------------------%

    % (See the interface comment.)  Look up CurrentState's row and
    % binary search it for a transition on Char.
next_state(CLXM, CurrentState, Char, NextState, IsAccepting) :-
    Rows            = CLXM ^ clxm_transition_map ^ trm_rows,
    AcceptingStates = CLXM ^ clxm_transition_map ^ trm_accepting_states,
    find_next_state(char__to_int(Char), Rows ^ elem(CurrentState),
NextState),
        % NOTE(review): assumes bitmap__get/2 returns a bool -
        % confirm against the bitmap module version in use.
    IsAccepting     = bitmap__get(AcceptingStates, NextState).

%-----------------------------------------------------------------------------%

    % Binary search a (char-byte-sorted) row for a transition on
    % Byte; State is the unpacked target state on success, and the
    % call fails if the row has no entry for Byte.
:- pred find_next_state(int, array(byte_transition), state).
:- mode find_next_state(in, in, out) is semidet.

find_next_state(Byte, ByteTransitions, State) :-
    Lo = array__min(ByteTransitions),
    Hi = array__max(ByteTransitions),
    find_next_state_0(Lo, Hi, Byte, ByteTransitions, State).



    % Standard binary search on the low (char) byte of each packed
    % transition.
:- pred find_next_state_0(int, int, int, array(byte_transition), state).
:- mode find_next_state_0(in, in, in, in, out) is semidet.

find_next_state_0(Lo, Hi, Byte, ByteTransitions, State) :-
    Lo =< Hi,
    Mid            = (Lo + Hi) // 2,
    ByteTransition = ByteTransitions ^ elem(Mid),
    ThisByte       = ByteTransition /\ 0xff,
    compare(R, Byte, ThisByte),
    (   R = (<),    find_next_state_0(Lo, Mid - 1, Byte, ByteTransitions,
State)
    ;   R = (=),    State = ByteTransition `unchecked_right_shift` 8
    ;   R = (>),    find_next_state_0(Mid + 1, Hi, Byte, ByteTransitions,
State)
    ).

%-----------------------------------------------------------------------------%

    % Succeeds iff the lexeme's current automaton state is an
    % accepting state.
in_accepting_state(CLXM) :-
    bitmap__is_set(
        CLXM ^ clxm_transition_map ^ trm_accepting_states,
        CLXM ^ clxm_state
    ).

%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%

%-----------------------------------------------------------------------------
% lex.regexp.m
% Ralph Becket <rbeck at microsoft.com>
% Fri Aug 18 06:43:09 BST 2000
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix
%
% Converts basic regular expressions into finite non-deterministic
% automata.
%
%-----------------------------------------------------------------------------%

:- module lex__regexp.

:- interface.

:- import_module char.
:- import_module lex__automata.



:- type regexp
    --->    null                % The empty regexp
    ;       atom(char)          % Match a single char
    ;       (regexp >> regexp)  % Concatenation
    ;       (regexp \/ regexp)  % Alternation
    ;       star(regexp)        % Kleene closure
    .
    % Some basic non-primitive regexps.

:- func str(string) = regexp.   % str("abc") = atom(a) >> atom(b) >> atom(c)
:- func any(string) = regexp.   % any("abc") = atom(a) \/ atom(b) \/ atom(c)
:- func anybut(string) = regexp.% anybut("abc") is complement of any("abc")
:- func opt(regexp) = regexp.   % opt(R)     = R \/ null
:- func plus(regexp) = regexp.  % plus(R)    = R >> star(R)

    % Some useful single-char regexps.

:- func digit = regexp.         % digit      = any("0123456789")
:- func lower = regexp.         % lower      = any("abc...z")
:- func upper = regexp.         % upper      = any("ABC...Z")
:- func alpha = regexp.         % alpha      = lower \/ upper
:- func alphanum = regexp.      % alphanum   = alpha \/ digit
:- func identstart = regexp.    % identstart = alpha \/ str("_")
:- func ident = regexp.         % ident      = alphanum \/ str("_")
:- func nl = regexp.            % nl         = str("\n")
:- func tab = regexp.           % tab        = str("\t")
:- func spc = regexp.           % spc        = str(" ")
:- func wspc = regexp.          % wspc       = any(" \t\n\r\f\v")
:- func dot = regexp.           % dot        = any("<except \n>")

    % Some useful compound regexps.

:- func int = regexp.           % int        = plus(digit)
:- func signed_int = regexp.    % signed_int = opt(any("+-")) >> int
:- func identifier = regexp.    % identifier = identstart >> star(ident)
:- func whitespace = regexp.    % whitespace = star(wspc)
:- func junk = regexp.          % junk       = star(dot)



    % Turn a regexp into an FNA.
    %
:- func regexp_to_fna(regexp) = state_mc.

    % Turn an FNA into a null transition-free FNA.
    %
:- func remove_null_transitions(state_mc) = state_mc.
:- mode remove_null_transitions(in) = out(null_transition_free_state_mc) is
det.

%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%

:- implementation.

:- import_module counter, map, assoc_list, std_util, list, set, string.

%-----------------------------------------------------------------------------%

    % Allocate the start and stop states first (state numbers from a
    % counter starting at 0), then compile R between them.
regexp_to_fna(R) = FNA :-
    C0 = counter__init(0),
    counter__allocate(Start, C0, C1),
    counter__allocate(Stop,  C1, C),
    compile(Start, R, Stop, Transitions, C, _),
    FNA = state_mc(Start, set__make_singleton_set(Stop), Transitions).

%-----------------------------------------------------------------------------%

    % compile(X, R, Y, Ts): Ts is the transition list of an
    % automaton fragment that gets from state X to state Y by
    % matching R.  The DCG arguments thread the state counter.
:- pred compile(state, regexp, state, transitions, counter, counter).
:- mode compile(in, in, in, out, in, out) is det.

    % The primitive regexps.

    % The empty regexp: a single null transition.
compile(X, null,       Y, [null(X, Y)]) --> [].

    % A single char: one atom transition.
compile(X, atom(C),    Y, [trans(X, C, Y)]) --> [].

    % Concatenation: allocate a fresh middle state Z.
compile(X, (RA >> RB), Y, TsA ++ TsB) -->
    counter__allocate(Z),
    compile(X, RA, Z, TsA),
    compile(Z, RB, Y, TsB).

    % Alternation: both branches share X and Y.
compile(X, (RA \/ RB), Y, TsA ++ TsB) -->
    compile(X, RA, Y, TsA),
    compile(X, RB, Y, TsB).

    % Kleene closure: a null transition X -> Y (zero matches) plus
    % a loop matching R from X back to X.
compile(X, star(R),    Y, TsA ++ TsB) -->
    compile(X, null, Y, TsA),
    compile(X, R,    X, TsB).

%-----------------------------------------------------------------------------%
% Some basic non-primitive regexps.

    % str("abc") = atom('a') >> (atom('b') >> atom('c'));
    % str("") = null.  Built right to left via str_foldr.
str(S) = R :-
    ( if S = "" then
        R = null
      else
        L = string__length(S),
        C = string__index_det(S, L - 1),
        R = str_foldr(func(Cx, Rx) = (atom(Cx) >> Rx), S, atom(C), L - 2)
    ).

    % any("abc") = atom('a') \/ (atom('b') \/ atom('c'));
    % any("") = null.  Built right to left via str_foldr.
any(S) = R :-
    ( if S = "" then
        R = null
      else
        L = string__length(S),
        C = string__index_det(S, L - 1),
        R = str_foldr(func(Cx, Rx) = (atom(Cx) \/ Rx), S, atom(C), L - 2)
    ).

    % anybut("abc"): matches any char with code in 0x01..0xff that
    % does not occur in the argument string.  Note that char code 0
    % is excluded from the range and so can never be matched.
anybut(S0) = R :-
    S = string__from_char_list(
            list__filter_map(
                ( func(X) = C is semidet :-
                    char__to_int(C, X),
                    not string__contains_char(S0, C)
                ),
                0x01 `..` 0xff
            )
        ),
    R = any(S).

    % str_foldr(Fn, S, X, I): right fold of Fn over the chars of S
    % at indices I down to 0, starting from accumulator X.
:- func str_foldr(func(char, T) = T, string, T, int) = T.

str_foldr(Fn, S, X, I) =
    ( if I < 0 then X
               else str_foldr(Fn, S, Fn(string__index_det(S, I), X), I - 1)
    ).

    % Zero or one occurrences of R.
opt(R)  = (R \/ null).

    % One or more occurrences of R.
plus(R) = (R >> star(R)).

%-----------------------------------------------------------------------------%
% Some useful single-char regexps.

    % These char classes are memoized since they build non-trivial
    % regexps and are likely to be evaluated repeatedly.
:- pragma memo(digit/0).
:- pragma memo(lower/0).
:- pragma memo(upper/0).
:- pragma memo(wspc/0).
:- pragma memo(dot/0).

digit      = any("0123456789").
lower      = any("abcdefghijklmnopqrstuvwxyz").
upper      = any("ABCDEFGHIJKLMNOPQRSTUVWXYZ").
wspc       = any(" \t\n\r\f\v").
dot        = anybut("\n").
alpha      = (lower \/ upper).
alphanum   = (alpha \/ digit).
identstart = (alpha \/ atom('_')).
ident      = (alphanum \/ atom('_')).
nl         = atom('\n').
tab        = atom('\t').
spc        = atom(' ').

%-----------------------------------------------------------------------------%
% Some useful compound regexps.

    % See the interface section for the intended expansions of
    % these compound regexps.
int        = plus(digit).
signed_int = (opt(any("+-")) >> int).
identifier = (identstart >> star(ident)).
whitespace = star(wspc).
junk       = star(dot).

%-----------------------------------------------------------------------------%

    % If we have a non-looping null transition from X to Y then
    % we need to add all the transitions from Y to X.
    %
    % We do this by first finding the transitive closure of the
    % null transition graph and then, for each edge X -> Y in that
    % graph, adding X -C-> Z for all C and Z s.t. Y -C-> Z.
    %
remove_null_transitions(FNA0) = FNA :-

    Ts = FNA0 ^ smc_state_transitions,
        % Separate the null transitions from the atom transitions.
    split_transitions(Ts, NullTs, CharTs),
        % Outs maps each state to the states it can reach via null
        % transitions (Ins is the reverse relation, unused here).
    trans_closure(NullTs, map__init, _Ins, map__init, Outs),
    NullFreeTs = add_atom_transitions(Outs, CharTs),

        % Any state that can null-reach a stop state must become a
        % stop state itself.
    StopStates0 = FNA0 ^ smc_stop_states,
    StopStates1 =
        set__list_to_set(
            list__filter_map(
                nulls_to_stop_state(Outs, FNA0 ^ smc_stop_states),
                NullTs
            )
        ),
    StopStates  = StopStates0 `set__union` StopStates1,

    FNA = (( FNA0
                ^ smc_state_transitions := NullFreeTs )
                ^ smc_stop_states       := StopStates).

%-----------------------------------------------------------------------------%

    % Partition a transition list into its null transitions and its
    % atom (char) transitions, preserving order.
    %
:- pred split_transitions(transitions, transitions, transitions).
:- mode split_transitions(in, out(null_transitions), out(atom_transitions))
            is det.

split_transitions([], [], []).

split_transitions([null(X, Y) | Ts], [null(X, Y) | NTs], CTs) :-
    split_transitions(Ts, NTs, CTs).

split_transitions([trans(X, C, Y) | Ts], NTs, [trans(X, C, Y) | CTs]) :-
    split_transitions(Ts, NTs, CTs).

%-----------------------------------------------------------------------------%

    % Maps a state to the set of states related to it through null
    % transitions (used for both directions of the relation).
:- type null_map == map(state, set(state)).

    % Transitive closure of the null transition graph: Ins maps
    % each state to the states that can reach it; Outs maps each
    % state to the states it can reach.
:- pred trans_closure(transitions, null_map, null_map, null_map, null_map).
:- mode trans_closure(in(null_transitions), in, out, in, out) is det.

trans_closure(Ts, Ins0, Ins, Outs0, Outs) :-
    list__foldl2(add_edge, Ts, Ins0, Ins, Outs0, Outs).

%-----------------------------------------------------------------------------%

    % Add the null edge X -> Y to the closure maps: everything that
    % reaches X (plus X itself) can now reach everything Y reaches
    % (plus Y itself), and conversely for the Ins map.
:- pred add_edge(transition, null_map, null_map, null_map, null_map).
:- mode add_edge(in(null_transition), in, out, in, out) is det.

add_edge(null(X, Y), Ins0, Ins, Outs0, Outs) :-
    XInAndX  = set__insert(null_map_lookup(X, Ins0), X),
    YOutAndY = set__insert(null_map_lookup(Y, Outs0), Y),
    Xs = set__to_sorted_list(XInAndX),
    Ys = set__to_sorted_list(YOutAndY),
    Outs = list__foldl(add_to_null_mapping(YOutAndY), Xs, Outs0),
    Ins  = list__foldl(add_to_null_mapping(XInAndX),  Ys, Ins0).

%-----------------------------------------------------------------------------%

    % Look X up in a null map, defaulting to the empty set when X
    % has no entry yet.
:- func null_map_lookup(state, null_map) = set(state).

null_map_lookup(X, Map) = Result :-
    ( if map__search(Map, X, Found) then
        Result = Found
      else
        Result = set__init
    ).

%-----------------------------------------------------------------------------%

    % Merge the set Xs into Y's entry in the null map.
:- func add_to_null_mapping(set(state), state, null_map) = null_map.

add_to_null_mapping(Xs, Y, Map) =
    map__set(Map, Y, Xs `set__union` null_map_lookup(Y, Map)).

%-----------------------------------------------------------------------------%

    % For each edge X -> Ys in the null closure, add a copy of each
    % atom transition leaving a member of Ys, re-rooted at X; then
    % sort and remove duplicates.
:- func add_atom_transitions(null_map, transitions) = transitions.
:- mode add_atom_transitions(in, in(atom_transitions)) =
            out(atom_transitions) is det.

add_atom_transitions(Outs, CTs) =
    list__sort_and_remove_dups(
        list__condense(
            [ CTs
            | list__map(
                add_atom_transitions_0(CTs),
                map__to_assoc_list(Outs)
              )
            ]
        )
    ).

%-----------------------------------------------------------------------------%

    % All atom transitions acquired by X from the states Ys it can
    % null-reach.
:- func add_atom_transitions_0(transitions, pair(state, set(state))) =
            transitions.
:- mode add_atom_transitions_0(in(atom_transitions), in) =
            out(atom_transitions) is det.

add_atom_transitions_0(CTs, X - Ys) =
    list__condense(
        list__map(add_atom_transitions_1(CTs, X), set__to_sorted_list(Ys))
    ).

%-----------------------------------------------------------------------------%

    % Y's atom transitions, re-rooted at X.
:- func add_atom_transitions_1(transitions, state, state) = transitions.
:- mode add_atom_transitions_1(in(atom_transitions), in, in) =
            out(atom_transitions) is det.

add_atom_transitions_1(CTs0, X, Y) = CTs :-
    list__filter_map(maybe_copy_transition(X, Y), CTs0, CTs).

%-----------------------------------------------------------------------------%

    % Succeeds iff the given transition leaves Y, producing the same
    % transition re-rooted at X.
:- pred maybe_copy_transition(state, state, transition, transition).
:- mode
maybe_copy_transition(in,in,in(atom_transition),out(atom_transition))
            is semidet.

maybe_copy_transition(X, Y, trans(Y, C, Z), trans(X, C, Z)).

%-----------------------------------------------------------------------------%

    % Succeeds with X iff X (the source of the null transition) can
    % null-reach some stop state, in which case X must itself
    % become a stop state.
:- func nulls_to_stop_state(null_map, set(state), transition) = state.
:- mode nulls_to_stop_state(in, in, in) = out is semidet.

nulls_to_stop_state(Outs, StopStates, null(X, _Y)) = X :-
    some [Z] (
        set__member(Z, map__lookup(Outs, X)),
        set__member(Z, StopStates)
    ).

%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%




--
Ralph Becket      |      MSR Cambridge      |      rbeck at microsoft.com 
--------------------------------------------------------------------------
mercury-developers mailing list
Post messages to:       mercury-developers at cs.mu.oz.au
Administrative Queries: owner-mercury-developers at cs.mu.oz.au
Subscriptions:          mercury-developers-request at cs.mu.oz.au
--------------------------------------------------------------------------



More information about the developers mailing list