[mercury-users] lexer construction

Michael Day mikeday at bigpond.net.au
Thu Feb 14 10:59:02 AEDT 2002


> I'll hack around the lex issue for now and wait for type classes.

As it happens, the fastest way to hack around the issue was to make it use 
type classes. Oh well.

Ralph, perhaps you could take a look at this and use it as the basis for
your version. It will need some review, as I did it rather quickly and I
don't fully understand how lexer_instance / lexer_state / buf / buf_state
interact. Nonetheless, It Works For Me.

Michael
-------------- next part --------------
%------------------------------------------------------------------------------%
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix
%
% lex.m
% Copyright (C) 2001 Ralph Becket <rbeck at microsoft.com>
% Sun Aug 20 09:08:46 BST 2000
%   THIS FILE IS HEREBY CONTRIBUTED TO THE MERCURY PROJECT TO
%   BE RELEASED UNDER WHATEVER LICENCE IS DEEMED APPROPRIATE
%   BY THE ADMINISTRATORS OF THE MERCURY PROJECT.
% Thu Jul 26 07:45:47 UTC 2001
% Copyright (C) 2001 The Rationalizer Intelligent Software AG
%   The changes made by Rationalizer are contributed under the terms 
%   of the GNU Lesser General Public License, see the file COPYING.LGPL
%   in this directory.
%
% This module puts everything together, compiling a list of lexemes
% into state machines and turning the input stream into a token stream.
%
%------------------------------------------------------------------------------%

:- module lex.

:- interface.

:- import_module std_util, string, char, list, io.

:- type token_creator(Token)
    ==                        (func(string) = Token).
:- inst token_creator
    ==                        (func(in) = out is det).

:- type lexeme(Token)
    ==                        pair(regexp, token_creator(Token)).

:- inst lexeme(Inst)
    ---> (ground - Inst).

:- type lexer(Token).
:- inst lexer
    ---> lexer(ground, ignore_pred).

:- type lexer_state(Token, Source).

:- type offset
    ==      int.                        % Byte offset into the source data.

    % Any errors should be reported by raising an exception.
    %
:- type read_result
    --->    ok(char)
    ;       eof.

    % read_from(Offset, Result, SrcIn, SrcOut) reads the char at
    % Offset from SrcIn and returns SrcOut.
    %
:- typeclass readable(T) where
    [
        pred read_from(offset, read_result, T, T),
        mode read_from(in, out, di, uo) is det
    ].

    % ignore_pred(Token): if it does not fail, Token must be ignored
    %
:- type ignore_pred(Tok)
    ==      pred(Tok).
:- inst ignore_pred
    ==      ( pred(in) is semidet ).

    % The type of regular expressions.
    %
:- type regexp.

    % The typeclass for types having a natural converter to regexp's
    %
:- typeclass regexp(T) where [
           func re(T) = regexp
].

    % Handling regexp's based on the typeclass regexp(T)
    %
:- func  null      = regexp.
:- func  T1 ++ T2  = regexp  <= (regexp(T1), regexp(T2)).
:- func  *(T)      = regexp  <= (regexp(T)).
    % One of the following two functions may be deprecated
    % in future, depending upon whether there's a consensus
    % concerning which is preferable.  Both express
    % alternation.
    %
:- func  T1 \/ T2  = regexp  <= (regexp(T1), regexp(T2)).
:- func (T1 or T2) = regexp  <= (regexp(T1), regexp(T2)).

    % Some instances of typeclass regexp(T)
    %
:- instance regexp(regexp).
:- instance regexp(char).
:- instance regexp(string).

    % Some basic non-primitive regexps.
    %
:- func any(string) = regexp.        % any("abc") = ('a') or ('b') or ('c')
:- func anybut(string) = regexp.     % anybut("abc") is complement of any("abc")
:- func ?(T) = regexp <= regexp(T).  % ?(R)       = R or null
:- func +(T) = regexp <= regexp(T).  % +(R)       = R ++ *(R)

    % Some useful single-char regexps.
    %
:- func digit = regexp.         % digit      = any("0123456789")
:- func lower = regexp.         % lower      = any("abc...z")
:- func upper = regexp.         % upper      = any("ABC...Z")
:- func alpha = regexp.         % alpha      = lower or upper
:- func alphanum = regexp.      % alphanum   = alpha or digit
:- func identstart = regexp.    % identstart = alpha or "_"
:- func ident = regexp.         % ident      = alphanum or "_"
:- func nl = regexp.            % nl         = re("\n")
:- func tab = regexp.           % tab        = re("\t")
:- func spc = regexp.           % spc        = re(" ")
:- func wspc = regexp.          % wspc       = any(" \t\n\r\f\v")
:- func dot = regexp.           % dot        = anybut("\n")

    % Some useful compound regexps.
    %
:- func nat = regexp.           % nat        = +(digit)
:- func signed_int = regexp.    % signed_int = ?("+" or "-") ++ nat
:- func real = regexp.          % real       = \d+((.\d+([eE]int)?)|[eE]int)
:- func identifier = regexp.    % identifier = identstart ++ *(ident)
:- func whitespace = regexp.    % whitespace = *(wspc)
:- func junk = regexp.          % junk       = *(dot)

   % Utility predicate to create ignore_pred's.
   % Use it in the form `ignore(my_token)' to ignore just `my_token'.
   % 
:- pred ignore(Token::in, Token::in) is semidet.

   % Utility function to return noval tokens.
   % Use it in the form `return(my_token)' inside a lexeme definition.
   %
:- func return(T,string) = T.

   % Utility operator to create lexemes.
   %
:- func (T1 -> token_creator(Tok)) = pair(regexp,token_creator(Tok))
            <= regexp(T1).

    % Construct a lexer from which we can generate running
    % instances.
    %
    % NOTE: If several lexemes match the same string only
    % the token generated by the one closest to the start
    % of the list of lexemes is returned.
    %
:- func init(list(lexeme(Tok))) = lexer(Tok).
:- mode init(in) = out(lexer) is det.

    % Construct a lexer from which we can generate running
    % instances. If we construct a lexer with init/4, we
    % can additionally ignore specific tokens.
    %
    % NOTE: If several lexemes match the same string only
    % the token generated by the one closest to the start
    % of the list of lexemes is returned.
    %
:- func init(list(lexeme(Tok)), ignore_pred(Tok)) = lexer(Tok).
:- mode init(in, in(ignore_pred)) = out(lexer) is det.

    % Handy read predicates.
    %
:- instance readable(io).
:- instance readable(string).

    % Generate a running instance of a lexer on some input source.
    % Note that you can't get the input source back until you stop
    % lexing.
    %
:- func start(lexer(Tok), Src) = lexer_state(Tok, Src).
:- mode start(in(lexer), di) = uo is det.

:- pred read(io__read_result(Tok), lexer_state(Tok, Src),
            lexer_state(Tok, Src)) <= readable(Src).
:- mode read(out, di, uo) is det.

    % Stop a running instance of a lexer and retrieve the input source.
    %
:- func stop(lexer_state(_Tok, Src)) = Src.
:- mode stop(di) = uo is det.

    % Sometimes (e.g. when lexing the io source) you want access to the
    % input stream without interrupting the lexing process.  This pred
    % provides that sort of access.
    %
:- pred manipulate_source(pred(Src, Src),
                lexer_state(Tok, Src), lexer_state(Tok, Src)).
:- mode manipulate_source(pred(di, uo) is det, di, uo) is det.

%------------------------------------------------------------------------------%
%------------------------------------------------------------------------------%

:- implementation.

:- include_module lex__automata.
:- include_module lex__buf.
:- include_module lex__convert_NFA_to_DFA.
:- include_module lex__lexeme.
:- include_module lex__regexp.

:- import_module map, char, bool, int, exception, array.
:- import_module lex__regexp, lex__automata, lex__convert_NFA_to_DFA.
:- import_module lex__lexeme, lex__buf.



:- type lexer(Token)
    --->    lexer(
                lex_compiled_lexemes    :: list(live_lexeme(Token)),
                lex_ignore_pred         :: ignore_pred(Token)
            ).

    % A running lexer.  init_lexemes holds the full compiled lexeme
    % list so that live_lexemes can be reset from it after each token
    % is recognised.
    %
:- type lexer_instance(Token)
    --->    lexer_instance(
                init_lexemes            :: list(live_lexeme(Token)),
                live_lexemes            :: list(live_lexeme(Token)),
                current_winner          :: winner(Token),
                buf_state               :: buf_state,
                ignore_pred             :: ignore_pred(Token)
            ).

:- inst lexer_instance
    --->    lexer_instance(
                live_lexeme_list, 
                live_lexeme_list, 
                winner, 
                ground,
                ignore_pred
            ).

    % A live lexeme is a compiled lexeme that can still match the
    % input read since the last token was recognised.
    %
:- type live_lexeme(Token)
    ==      compiled_lexeme(Token).
:- inst live_lexeme
    ==      compiled_lexeme(token_creator).
:- inst live_lexeme_list
    ==      list__list_skel(live_lexeme).

    % The best match seen so far, if any: the winning lexeme's token
    % creator paired with the buffer offset just past the matched
    % string.
    %
:- type winner(Token)
    ==      maybe(pair(token_creator(Token), offset)).
:- inst winner
    --->    yes(pair(token_creator, ground))
    ;       no.

%------------------------------------------------------------------------------%

ignore(Tok,Tok).

%------------------------------------------------------------------------------%

return(Token, _) = Token.

%------------------------------------------------------------------------------%

(R1 -> TC) = (re(R1) - TC).

%------------------------------------------------------------------------------%

init(Lexemes) = init(Lexemes, DontIgnoreAnything) :-
    DontIgnoreAnything = ( pred(_::in) is semidet :- semidet_fail ).

init(Lexemes, IgnorePred) =
    lexer(CompiledLexemes, IgnorePred)
 :-
    CompiledLexemes = list__map(compile_lexeme, Lexemes).

%------------------------------------------------------------------------------%

start(Lexer, Src) = LexerState :-
    init_lexer_instance(Lexer, Instance, Buf),
    LexerState = args_lexer_state(Instance, Buf, Src).

%------------------------------------------------------------------------------%

:- pred init_lexer_instance(lexer(Tok), lexer_instance(Tok), buf).
:- mode init_lexer_instance(in(lexer), out(lexer_instance), array_uo) is det.

    % Set up a fresh instance: every compiled lexeme is live, there is
    % no winner yet, and the buffer is empty.
init_lexer_instance(Lexer, Instance, Buf) :-
    buf__init(BufState, Buf),
    Lexemes = Lexer ^ lex_compiled_lexemes,
    Ignore  = Lexer ^ lex_ignore_pred,
    Instance = lexer_instance(Lexemes, Lexemes, no, BufState, Ignore).

%------------------------------------------------------------------------------%

stop(LexerState) = Src :-
    lexer_state_args(LexerState, _Instance, _Buf, Src).

%------------------------------------------------------------------------------%

read(Result, LexerState0, LexerState) :-
    lexer_state_args(LexerState0, Instance0, Buf0, Src0),
    read_0(Result, Instance0, Instance, Buf0, Buf, Src0, Src),
    LexerState = args_lexer_state(Instance, Buf, Src).



:- pred read_0(io__read_result(Tok),
            lexer_instance(Tok), lexer_instance(Tok),
            buf, buf, Src, Src) <= readable(Src).
:- mode read_0(out,
            in(lexer_instance), out(lexer_instance),
            array_di, array_uo, di, uo) is det.

    % Basically, just read chars from the buf and advance the live lexemes
    % until we have a winner or hit an error (no parse).
    %
read_0(Result, Instance0, Instance, Buf0, Buf, Src0, Src) :-

    BufState0    = Instance0 ^ buf_state,

        % buf__read may grow the buffer and/or consult the input
        % source, hence it threads Buf and Src as well as the state.
    buf__read(BufReadResult, BufState0, BufState1, Buf0, Buf1, Src0, Src1),
    (
        BufReadResult = ok(Char),
        process_char(Result, Char,
                Instance0, Instance, BufState1, Buf1, Buf, Src1, Src)
    ;
        BufReadResult = eof,
        Buf = Buf1,
        Src = Src1,
            % End of input: see whether any live lexeme accepts what
            % has been read since the last token.
        process_eof(Result, Instance0, Instance, BufState1, Buf)
    ).

%------------------------------------------------------------------------------%

:- pred process_char(io__read_result(Tok), char,
            lexer_instance(Tok), lexer_instance(Tok),
            buf_state, buf, buf, Src, Src) <= readable(Src).
:- mode process_char(out, in, in(lexer_instance), out(lexer_instance),
            in, array_di, array_uo, di, uo) is det.

    % Advance every live lexeme over Char.  If that kills off the last
    % live lexeme, report the best winner recorded so far (or an error
    % if there is none); otherwise record any new winner and keep
    % reading.
    %
process_char(Result, Char, Instance0, Instance,
        BufState, Buf0, Buf, Src0, Src) :-

    LiveLexemes0 = Instance0 ^ live_lexemes,
    Winner0      = Instance0 ^ current_winner,

        % A winner is recorded with the cursor offset *after* Char:
        % this is where the cursor is rewound to when the match is
        % finally reported.
    advance_live_lexemes(Char, buf__cursor_offset(BufState),
            LiveLexemes0, LiveLexemes, Winner0, Winner),
    (
        LiveLexemes = [],               % Nothing left to consider.

        process_any_winner(Result, Winner, Instance0, Instance, BufState,
                Buf0, Buf, Src0, Src)
    ;
        LiveLexemes = [_ | _],          % Still some open possibilities.

        Instance1 = (((Instance0
                            ^ live_lexemes   := LiveLexemes)
                            ^ current_winner := Winner)
                            ^ buf_state      := BufState),
        read_0(Result, Instance1, Instance, Buf0, Buf, Src0, Src)
    ).

%------------------------------------------------------------------------------%

:- pred process_any_winner(io__read_result(Tok), winner(Tok),
            lexer_instance(Tok), lexer_instance(Tok),
            buf_state, buf, buf, Src, Src) <= readable(Src).
:- mode process_any_winner(out, in(winner),
            in(lexer_instance), out(lexer_instance),
            in, array_di, array_uo, di, uo) is det.

    % All live lexemes have died.  If there is a recorded winner,
    % rewind the cursor to just past the winning match, commit, and
    % return its token - unless the token is ignorable, in which case
    % lexing restarts from the rewound position with a fresh set of
    % live lexemes.
    %
process_any_winner(Result, yes(TokenCreator - Offset), Instance0, Instance,
        BufState0, Buf0, Buf, Src0, Src) :-

    BufState1 = buf__rewind_cursor(Offset, BufState0),
    Instance1 = ((( Instance0
                        ^ live_lexemes   := Instance0 ^ init_lexemes)
                        ^ current_winner := no)
                        ^ buf_state      := buf__commit(BufState1)),
    ( if

         get_token_from_buffer(BufState1, Buf0, Instance0, TokenCreator, Token)

      then
    
         Result   = ok(Token),
         Instance = Instance1,
         Buf      = Buf0,
         Src      = Src0
    
      else
             % The token is ignorable: carry on lexing.
         read_0(Result, Instance1, Instance, Buf0, Buf, Src0, Src)
    ).

    % No winner: the input matches no lexeme.  Report an error at the
    % start offset and step the committed position past the offending
    % char so that a subsequent read can make progress.
    %
process_any_winner(Result, no, Instance0, Instance,
        BufState0, Buf, Buf, Src, Src) :-

    Start     = buf__start_offset(BufState0),
    BufState1 = buf__rewind_cursor(Start + 1, BufState0),
    Result    = error("input not matched by any regexp", Start),
    Instance  = ((( Instance0
                        ^ live_lexemes   :=
                                Instance0 ^ init_lexemes)
                        ^ current_winner := no)
                        ^ buf_state      := buf__commit(BufState1)).

%------------------------------------------------------------------------------%

:- pred process_eof(io__read_result(Tok),
            lexer_instance(Tok), lexer_instance(Tok),
            buf_state, buf).
:- mode process_eof(out, in(lexer_instance), out(lexer_instance),
            in, array_ui) is det.

    % End of input.  If some still-live lexeme is in an accepting
    % state (and its token is not ignorable) return that token;
    % otherwise report eof.  Note that a trailing ignorable token is
    % therefore silently dropped.  The live lexeme list is emptied, so
    % any further reads also report eof.
    %
process_eof(Result, Instance0, Instance, BufState, Buf) :-

    Result   =
        ( if
            live_lexeme_in_accepting_state(Instance0 ^ live_lexemes,
                        TokenCreator),
            get_token_from_buffer(BufState, Buf, Instance0,
                        TokenCreator, Token)
          then ok(Token)
          else eof
        ),
    Instance = ((Instance0
                        ^ live_lexemes := [])
                        ^ buf_state    := buf__commit(BufState)).

%------------------------------------------------------------------------------%

:- pred get_token_from_buffer(buf_state, buf, lexer_instance(Tok),
                  token_creator(Tok), Tok).
:- mode get_token_from_buffer(in, array_ui, in(lexer_instance),
                  in(token_creator), out) is semidet.

    % Build the token for the matched string.  Fails iff the resulting
    % token is one the instance's ignore predicate accepts.
get_token_from_buffer(BufState, Buf, Instance, MakeToken, Token) :-
    MatchedString = buf__string_to_cursor(BufState, Buf),
    Token = MakeToken(MatchedString),
    Ignore = Instance ^ ignore_pred,
    not Ignore(Token).

%------------------------------------------------------------------------------%

    % Note that in the case where two or more lexemes match the same
    % string, the win is given to the earliest such lexeme in the list.
    % This matches the behaviour of standard C lex.
    %
:- pred advance_live_lexemes(char, offset,
            list(live_lexeme(Token)), list(live_lexeme(Token)),
            winner(Token), winner(Token)).
:- mode advance_live_lexemes(in, in, in(live_lexeme_list), 
            out(live_lexeme_list), 
            in(winner), out(winner)) is det.

advance_live_lexemes(_Char, _Offset, [], [], Winner, Winner).

advance_live_lexemes(Char, Offset, [L0 | Ls0], Ls, Winner0, Winner) :-

    State0        = L0 ^ state,
    ATok          = L0 ^ token,

        % A lexeme with no transition on Char dies: it is dropped from
        % the live list (the else branch below).
    ( if next_state(L0, State0, Char, State, IsAccepting) then

        (
            IsAccepting = no,
            Winner1     = Winner0
        ;
            IsAccepting = yes,
                % Keep the incumbent winner only if it matched the
                % same length (so earlier lexemes take precedence at
                % equal length); a match at a larger Offset is longer
                % and always supersedes it.
            Winner1     = ( if   Winner0 = yes(_ATok0 - Offset0),
                                 Offset  = Offset0
                            then Winner0
                            else yes(ATok - Offset)
                          )
        ),
        advance_live_lexemes(Char, Offset, Ls0, Ls1, Winner1, Winner),
        Ls = [( L0 ^ state := State ) | Ls1]

      else

        advance_live_lexemes(Char, Offset, Ls0, Ls, Winner0, Winner)
    ).

%------------------------------------------------------------------------------%

:- pred live_lexeme_in_accepting_state(list(live_lexeme(Tok)),
                token_creator(Tok)).
:- mode live_lexeme_in_accepting_state(in(live_lexeme_list), 
                out(token_creator)) is semidet.

    % Return the token creator of the first live lexeme that is in an
    % accepting state; fails on the empty list.
live_lexeme_in_accepting_state([Lexeme | Lexemes], TokenCreator) :-
    ( if in_accepting_state(Lexeme) then
        TokenCreator = Lexeme ^ token
      else
        live_lexeme_in_accepting_state(Lexemes, TokenCreator)
    ).


%------------------------------------------------------------------------------%
%------------------------------------------------------------------------------%

    % It's much more convenient (especially for integration with, e.g.
    % parsers such as moose) to package up the lexer_instance, buf
    % and Src in a single object.

:- type lexer_state(Tok, Src)
    --->    lexer_state(
                run                     :: lexer_instance(Tok), % Lexing state
                buf                     :: buf,                 % Read-ahead chars
                src                     :: Src                  % Input source
            ).

%------------------------------------------------------------------------------%

:- func args_lexer_state(lexer_instance(Tok), buf, Src) =
            lexer_state(Tok, Src).
:- mode args_lexer_state(in(lexer_instance), array_di, di) = uo is det.

    % Bundle the three components into a lexer_state; the result is
    % promised unique so it can be threaded with di/uo modes.
args_lexer_state(Instance, Buf, Src) = State :-
    State0 = lexer_state(Instance, Buf, Src),
    unsafe_promise_unique(State0, State).

%------------------------------------------------------------------------------%

:- pred lexer_state_args(lexer_state(Tok,Src),lexer_instance(Tok),buf,Src).
:- mode lexer_state_args(di, out(lexer_instance), array_uo, uo)  is det.

    % Take a lexer_state apart again, promising uniqueness of the
    % buffer and source so they can be threaded onwards.
lexer_state_args(State, Instance, Buf, Src) :-
    State = lexer_state(Instance, Buf0, Src0),
    unsafe_promise_unique(Buf0, Buf),
    unsafe_promise_unique(Src0, Src).

%------------------------------------------------------------------------------%

manipulate_source(P, State0, State) :-
    lexer_state_args(State0, Instance, Buf, Src0),
    P(Src0, Src),
    State = args_lexer_state(Instance, Buf, Src).

%------------------------------------------------------------------------------%

    % Reading from io: chars only ever arrive in stream order, so the
    % Offset argument is ignored.  An io error is not a legitimate
    % read_result, so it is raised as an exception (as documented in
    % the interface).
:- instance readable(io) where
    [
        (read_from(_Offset, Result) -->
            io__read_char(IOResult),
            {   IOResult = ok(Char),              Result = ok(Char)
            ;   IOResult = eof,                   Result = eof
            ;   IOResult = error(_E),             throw(IOResult)
            }
        )
    ].

%------------------------------------------------------------------------------%

    % Reading from a string source: a simple bounds-checked index.
    % The string itself is never modified; it is merely re-promised
    % unique to satisfy the di/uo modes.
:- instance readable(string) where
    [
        (read_from(Offset, Result, String0, String) :-
                % NOTE(review): assumes 0 =< Offset - buf offsets
                % start at 0 and only ever increase.
            ( if Offset < string__length(String0)
            then Result = ok(string__unsafe_index(String0, Offset))
            else Result = eof
            ),
            unsafe_promise_unique(String0, String)
        )
    ].

%------------------------------------------------------------------------------%
% The type of regular expressions.

:- type regexp
    --->    eps                    % The empty regexp (matches "")
    ;       atom(char)             % Match a single given char
    ;       conc(regexp,regexp)    % Concatenation
    ;       alt(regexp, regexp)    % Alternation
    ;       star(regexp).          % Kleene closure (zero or more)

%------------------------------------------------------------------------------%

:- instance regexp(regexp) where [
    re(RE) = RE             % A regexp converts to itself.
].

:- instance regexp(char) where [
    re(C) = atom(C)         % A char matches exactly that char.
].

:- instance regexp(string) where [
        % A string denotes the concatenation of its chars, built by
        % folding ++ right-to-left, seeded with the last char; the
        % empty string converts to null.
    re(S) =  R :-
        ( if S = "" then
            R = null
          else
            L = string__length(S),
            C = string__index_det(S, L - 1),
            R = str_foldr(func(Cx, Rx) = (Cx ++ Rx), S, re(C), L - 2)
        )
].

%------------------------------------------------------------------------------%
% Basic primitive regexps.

    % These map directly onto the regexp data constructors; all other
    % combinators are defined in terms of these and re/1.
    %
 null      = eps.
 R1 ++ R2  = conc(re(R1), re(R2)).
 R1 \/ R2  = alt(re(R1), re(R2)).
(R1 or R2) = alt(re(R1), re(R2)).
 *(R1)     = star(re(R1)).

%------------------------------------------------------------------------------%
% Some basic non-primitive regexps.

any(S) = R :-
    ( if S = "" then
        R = null
      else
        L = string__length(S),
        C = string__index_det(S, L - 1),
        R = str_foldr(func(Cx, Rx) = (Cx or Rx), S, re(C), L - 2)
    ).

    % anybut(S) matches any single char in the range 0x01-0xff that
    % does not occur in S.  Note that NUL (0x00) is never matched.
    %
anybut(S0) = R :-
    S = string__from_char_list(
            list__filter_map(
                ( func(X) = C is semidet :-
                        % char__to_int/2 is used in reverse mode here;
                        % it fails for codes with no char value.
                    char__to_int(C, X),
                    not string__contains_char(S0, C)
                ),
                0x01 `..` 0xff
            )
        ),
    R = any(S).

:- func str_foldr(func(char, T) = T, string, T, int) = T.

    % Right fold over S[0..I]: computes F(S[0], F(S[1], ... F(S[I], X))).
    % A negative index yields the accumulator unchanged.
str_foldr(F, S, Acc, I) =
    ( if I < 0 then
        Acc
      else
        str_foldr(F, S, F(string__index_det(S, I), Acc), I - 1)
    ).

?(R) = (R or null).             % ?(R): zero or one occurrence of R.

+(R) = (R ++ *(R)).             % +(R): one or more occurrences of R.

%------------------------------------------------------------------------------%
% Some useful single-char regexps.

    % We invite the compiler to memo the values of these constants that
    % (a) are likely to be quite common in practice and (b) take *some*
    % time to compute.
    %
:- pragma memo(digit/0).
:- pragma memo(lower/0).
:- pragma memo(upper/0).
:- pragma memo(wspc/0).
:- pragma memo(dot/0).

digit      = any("0123456789").
lower      = any("abcdefghijklmnopqrstuvwxyz").
upper      = any("ABCDEFGHIJKLMNOPQRSTUVWXYZ").
wspc       = any(" \t\n\r\f\v").
dot        = anybut("\n").
    % The remaining constants are built from the memoed ones above,
    % so they are comparatively cheap to recompute.
alpha      = (lower or upper).
alphanum   = (alpha or digit).
identstart = (alpha or ('_')).
ident      = (alphanum or ('_')).
nl         = re('\n').
tab        = re('\t').
spc        = re(' ').

%------------------------------------------------------------------------------%
% Some useful compound regexps.

nat        = +(digit).
signed_int = ?("+" or "-") ++ nat.
    % A real requires a fractional part and/or an exponent after the
    % integer part: a plain integer such as "123" does not match, and
    % neither does ".5" (no digits before the point).
real       = signed_int ++ (
                ("." ++ nat ++ ?(("e" or "E") ++ signed_int)) or
                (                ("e" or "E") ++ signed_int)
             ).
identifier = (identstart ++ *(ident)).
whitespace = *(wspc).
junk       = *(dot).

%------------------------------------------------------------------------------%
%------------------------------------------------------------------------------%
-------------- next part --------------
% -----------------------------------------------------------------------------%
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix
%
% lex.buf.m
% Copyright (C) 2001 Ralph Becket <rbeck at microsoft.com>
% Sat Aug 19 16:56:30 BST 2000
%
%   THIS FILE IS HEREBY CONTRIBUTED TO THE MERCURY PROJECT TO
%   BE RELEASED UNDER WHATEVER LICENCE IS DEEMED APPROPRIATE
%   BY THE ADMINISTRATORS OF THE MERCURY PROJECT.
%
% This module implements the rolling char buffer.  The char
% buffer is optimised for efficiency.
%
% The buffer stores chars read from an input source (e.g. IO
% or string).  Because the lexer can want to `unread' chars
% (when a long candidate lexeme fails), the buffer may
% contain `read ahead' chars.  The structure of the buffer
% is as follows:
%
%    buf[0]                                       buf[len]
%    |                  len = end - start                |
%    v                                                   v
%   +---------------------------------------------------+
%   |.|.|.|.|.|a|b|c|d|e|f|g|h|i|j|k|l| | | | | | | | | |
%   +---------------------------------------------------+
%    ^         ^           ^           ^                 ^ 
%    |         |           |           |                 |
%    origin    start       cursor      end        terminus
%
% origin, start etc. are all recorded in terms of offsets
% (number of chars) from the start of the input stream,
% counting the first char read as being at offset 1.  Hence,
% the char at the cursor is at buf[cursor - origin].
%
% READING CHARS
%
% * In the diagram, `g' is the next char that will be read.
%
% The cursor marks the point of the next char to be read in.
%
% If the cursor advances to the end, then a new char is read
% from the input and inserted into the buffer at the end and
% the end marker is incremented.
%
% If the end marker advances to the terminus, then the
% buffer is extended and the terminus adjusted
% appropriately.  The buffer may take this opportunity to
% garbage collect the inaccessible chars between the origin
% and the start marker.
%
% EOF
%
% * In the diagram, if EOF had been detected then the end
% marker would give the offset at which it occurred.
%
% When EOF is read from the input stream, a special eof flag
% is set (and the end marker, of course, will give its offset).
% Any attempt to read at or past this point will cause the
% buffer to return the EOF signal.
%
% REWINDING
%
% * In the diagram, the cursor may be rewound to any point
% between the start marker and itself, inclusive.
%
% At any point, the cursor may be reset to any point between
% itself and the start marker inclusive.
%
%
% At any point, the user may ask for the offset of the cursor.
%
% STRING EXTRACTION
%
% * In the diagram, the string read in so far is "abcdef".
%
% The buffer provides a facility to return the substring
% consisting of the chars between the start marker and up
% to, but not including, that under the cursor.
%
% COMMITS
%
% * In the diagram, a commit will move the start marker to
% be the same as the cursor.
%
% The user can issue a commit order to the buffer which
% moves the start pointer to where the cursor is, preventing
% rewinds back past this point.  This is important since it
% means that the region prior to the cursor in the buffer is
% now available for garbage collection.
%
% -----------------------------------------------------------------------------%

:- module lex__buf.

:- interface.

:- import_module array, char, bool, string.



    % XXX We need a char and/or byte array datatype;
    % array(char) uses one word for each char, which is
    % rather wasteful.
    %
:- type buf
    ==      array(char).

    % The input source is any instance of the readable/1 typeclass
    % (typically io or string); see read_from/4 in lex.m.
    %
:- type buf_state
    --->    buf_state(
                buf_origin                  :: offset,
                                                % Stream offset of buf[0]
                buf_start                   :: offset,
                                                % Earliest rewindable offset
                buf_cursor                  :: offset,
                                                % Next char to be read
                buf_end                     :: offset,
                                                % One past last char read in
                buf_terminus                :: offset,
                                                % One past last buffer slot
                buf_eof_seen                :: bool    % If `yes' then buf_end
                                                        % gives the offset at
                                                        % which eof was seen
            ).

    % Returns an empty buffer and an initialised buf_state.
    %
:- pred init(buf_state, buf).
:- mode init(out, array_uo) is det.

    % Reads the next char and advances the cursor.  Updates the
    % buf_state, the buf and the input.
    %
:- pred read(read_result, buf_state, buf_state, buf, buf, T, T) <= readable(T).
:- mode read(out, in, out, array_di, array_uo, di, uo) is det.

    % Returns the offset of the start marker.
    %
:- func start_offset(buf_state) = offset.

    % Returns the offset of the cursor.
    %
:- func cursor_offset(buf_state) = offset.

    % Rewinds the buffer.  An exception is raised if the offset provided
    % is not legitimate.
    %
:- func rewind_cursor(offset, buf_state) = buf_state.

    % Extracts the string of chars between the start and cursor.
    %
:- func string_to_cursor(buf_state, buf) = string.
:- mode string_to_cursor(in, array_ui) = out is det.

    % Advances the start marker to the cursor.  Rewinds past the
    % cursor will therefore no longer be possible.
    %
:- func commit(buf_state) = buf_state.

% -----------------------------------------------------------------------------%
% -----------------------------------------------------------------------------%

:- implementation.

:- import_module exception.


    % The amount the buffer is grown by if (a) more space is
    % required and (b) the available space is smaller than
    % this amount.
    %
:- func low_water_mark = int.
low_water_mark = 256.

:- func initial_buf_size = int.
initial_buf_size = 1024.

    % XXX Debugging values.
    % %
% :- func low_water_mark = int.
% low_water_mark = 16.
% 
% :- func initial_buf_size = int.
% initial_buf_size = 32.

% -----------------------------------------------------------------------------%

init(BufState, Buf) :-
        % All offsets start at zero; eof has not been seen.
    BufState = buf_state(0, 0, 0, 0, initial_buf_size, no),
        % The fill char '@' is arbitrary: slots beyond buf_end are
        % never exposed to the caller.
    Buf      = array__init(initial_buf_size, ('@')).

% -----------------------------------------------------------------------------%

read(Result, BufState0, BufState, Buf0, Buf, Src0, Src) :-

    Origin   = BufState0 ^ buf_origin,
    Start    = BufState0 ^ buf_start,
    Cursor   = BufState0 ^ buf_cursor,
    End      = BufState0 ^ buf_end,
    Terminus = BufState0 ^ buf_terminus,
    EOFSeen  = BufState0 ^ buf_eof_seen,

    ( if Cursor < End then

        Result   = ok(array__lookup(Buf0, Cursor - Origin)),
        BufState = ( BufState0 ^ buf_cursor := Cursor + 1 ),
        Buf      = Buf0,
        Src      = Src0

      else /* Cursor = End */ if EOFSeen = yes then

        Result   = eof,
        BufState = BufState0,
        Buf      = Buf0,
        Src      = Src0

      else if End < Terminus then

        read_from(Cursor, Result, Src0, Src),

        ( if Result = ok(Char) then

            Buf = array__set(Buf0, End - Origin, Char),
            BufState = (( BufState0
                                ^ buf_cursor := Cursor + 1 )
                                ^ buf_end    := End + 1 )
          else

            Buf      = Buf0,
            BufState = BufState0
        )

      else /* Need to GC and/or extend the buffer */

        GarbageLength = Start - Origin,
        adjust_buf(GarbageLength, ExtraLength, Buf0, Buf1),
        NewOrigin     = Origin + GarbageLength,
        NewTerminus   = Terminus + GarbageLength + ExtraLength,
        BufState1     = (( BufState0
                                ^ buf_origin   := NewOrigin )
                                ^ buf_terminus := NewTerminus ),
        read(Result, BufState1, BufState, Buf1, Buf, Src0, Src)
    ).

% -----------------------------------------------------------------------------%

    % Garbage collects the chars between the origin and start and
    % extends the buffer if the remaining space is below the low
    % water mark.
    %
:- pred adjust_buf(int, int, buf, buf).
:- mode adjust_buf(in, out, array_di, array_uo) is det.

adjust_buf(GarbageLength, ExtraLength, Buf0, Buf) :-

    Size0 = array__size(Buf0),

        % If compaction alone would reclaim fewer than low_water_mark
        % slots, allocate a larger target array as well.
    ( if GarbageLength < low_water_mark then /* We need to grow the buffer */
        array__init(Size0 + low_water_mark, ('@'), Buf1),
        ExtraLength = low_water_mark
      else
        Buf1 = Buf0,
        ExtraLength = 0
    ),

        % Slide the live chars down to index 0 of the (possibly new)
        % target array.
    Buf = shift_buf(0, Size0 - GarbageLength, GarbageLength, Buf0, Buf1).

% -----------------------------------------------------------------------------%

:- func shift_buf(int, int, int, buf, buf) = buf.
:- mode shift_buf(in, in, in, array_ui, array_di) = array_uo is det.

    % Copy Src[Idx + Shift .. Limit + Shift - 1] down into
    % Dst[Idx .. Limit - 1], returning the updated Dst.
shift_buf(Idx, Limit, Shift, Src, Dst) =
    ( if Idx < Limit then
        shift_buf(Idx + 1, Limit, Shift, Src,
            array__set(Dst, Idx, array__lookup(Src, Idx + Shift)))
      else
        Dst
    ).

% -----------------------------------------------------------------------------%

start_offset(BufState) = BufState ^ buf_start.

% -----------------------------------------------------------------------------%

cursor_offset(BufState) = BufState ^ buf_cursor.

% -----------------------------------------------------------------------------%

    % Rewinding outside [buf_start, buf_cursor] would expose committed
    % (possibly reclaimed) or not-yet-read data, hence the exception.
rewind_cursor(Offset, BufState) =
    ( if ( Offset < BufState ^ buf_start ; BufState ^ buf_cursor < Offset ) then
        throw("buf: rewind/2: offset arg outside valid range")
      else
        BufState ^ buf_cursor := Offset
    ).

% -----------------------------------------------------------------------------%

    % Note: array__fetch_items is inclusive of both end indices, so
    % [From .. From + Length] yields (cursor - start) chars - i.e.
    % everything from the start marker up to, but not including, the
    % char under the cursor.  Hence the `- 1' in Length.
string_to_cursor(BufState, Buf) = String :-
    From   = BufState ^ buf_start - BufState ^ buf_origin,
    Length = (BufState ^ buf_cursor - 1 - BufState ^ buf_start),
    To     = From + Length,
    String = string__from_char_list(array__fetch_items(Buf, From, To)).

% -----------------------------------------------------------------------------%

commit(BufState) = ( BufState ^ buf_start := BufState ^ buf_cursor ).

% -----------------------------------------------------------------------------%
% -----------------------------------------------------------------------------%


More information about the users mailing list