[m-dev.] For review: hash table implementation

Ralph Becket rbeck at microsoft.com
Fri Feb 2 07:20:46 AEDT 2001


I *hate* those stupid stupid stupid logical bugs.  But it works
now.

Here is an updated version with the requested changes made and suggested
optimizations applied and the extra functionality supplied.

This has raised the ugly problem of non-simple modes preventing good
things, but that's a post for tomorrow...

Ralph



%
----------------------------------------------------------------------------
%
% hash_table.m
% Ralph Becket <rbeck at microsoft.com>
% Tue Jan 30 15:21:45 GMT 2001
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix ft=mercury
%
% Hash table implementation.
%
% This implementation uses double hashing and requires the user to
% supply a predicate that will compute two independent hash values
% for any given key.
%
% The number of buckets in the hash table is always a power of 2.
%
% When a user set occupancy level is achieved, the number of buckets
% in the table is doubled and the previous contents reinserted into
% the new hash table.
%
%
----------------------------------------------------------------------------
%

:- module hash_table.

:- interface.

:- import_module int, assoc_list, float, string, array, bitmap.



:- type hash_table(K, V).

    % XXX This is all fake until the compiler can handle nested unique
modes.
    %
:- inst hash_table ==
            bound(ht(ground, ground, ground, hash_pred, bitmap, array,
array)).
:- mode hash_table_ui == in(hash_table).
:- mode hash_table_di == di(hash_table).
:- mode hash_table_uo == out(hash_table).

:- type hash_pred(K) == ( pred(K, int, int) ).
:- inst hash_pred    == ( pred(in, out, out) is det ).

    % new(HashPred, N, MaxOccupancy)
    % constructs a new hash table with initial size 2 ^ N that is
    % doubled whenever MaxOccupancy is achieved; elements are
    % indexed using HashPred.
    %
    % N must be greater than 1.
    % MaxOccupancy must be in (0.0, 1.0).
    %
:- func new(hash_pred(K), int, float) = hash_table(K, V).
:- mode new(in(hash_pred), in, in) = hash_table_uo is det.

    % new_default(HashFn) constructs a hash table with default size and
    % occupancy arguments.
    %
:- func new_default(hash_pred(K)) = hash_table(K, V).
:- mode new_default(in(hash_pred)) = hash_table_uo is det.

    % Retrieve the hash_pred associated with a hash table.
    %
:- func hash_pred(hash_table(K, V)) = hash_pred(K).
:- mode hash_pred(hash_table_ui) = out(hash_pred) is det.

    % Default hash_preds for ints and strings.
    %
:- func int_hash_pred = hash_pred(int).
:- mode int_hash_pred = out(hash_pred) is det.

:- func string_hash_pred = hash_pred(string).
:- mode string_hash_pred = out(hash_pred) is det.

    % Returns the number of buckets in a hash table.
    %
:- func num_buckets(hash_table(K, V)) = int.
:- mode num_buckets(hash_table_ui) = out is det.
%:- mode num_buckets(in) = out is det.

    % Returns the number of occupants in a hash table.
    %
:- func num_occupants(hash_table(K, V)) = int.
:- mode num_occupants(hash_table_ui) = out is det.
%:- mode num_occupants(in) = out is det.

    % Insert key-value binding into a hash table; if one is
    % already there then the previous value is overwritten.
    %
:- func set(hash_table(K, V), K, V) = hash_table(K, V).
:- mode set(hash_table_di, in, in) = hash_table_uo is det.

    % Field update for hash tables.
    % HT ^ elem(K) := V  is equivalent to  set(HT, K, V).
    %
:- func 'elem :='(K, hash_table(K, V), V) = hash_table(K, V).
:- mode 'elem :='(in, hash_table_di, in) = hash_table_uo is det.

    % Insert a key-value binding into a hash table.  An
    % exception is thrown if a binding for the key is already
    % present.
    %
:- func det_insert(hash_table(K, V), K, V) = hash_table(K, V).
:- mode det_insert(hash_table_di, in, in) = hash_table_uo is det.

    % Change a key-value binding in a hash table.  An
    % exception is thrown if a binding for the key does not
    % already exist.
    %
:- func det_update(hash_table(K, V), K, V) = hash_table(K, V).
:- mode det_update(hash_table_di, in, in) = hash_table_uo is det.

    % Delete the entry for the given key, leaving the hash table
    % unchanged if there is no such entry.
    %
:- func delete(hash_table(K, V), K) = hash_table(K, V).
:- mode delete(hash_table_di, in) = hash_table_uo is det.

    % Lookup the value associated with the given key.  An exception
    % is raised if there is no entry for the key.
    %
:- func lookup(hash_table(K, V), K) = V.
:- mode lookup(hash_table_ui, in) = out is det.
%:- mode lookup(in, in) = out is det.

    % Field access for hash tables.
    % HT ^ elem(K)  is equivalent to  lookup(HT, K).
    %
:- func elem(K, hash_table(K, V)) = V.
:- mode elem(in, hash_table_ui) = out is det.
%:- mode elem(in, in) = out is det.

    % Like lookup, but just fails if there is no entry for the key.
    %
:- pred search(hash_table(K, V), K, V).
:- mode search(hash_table_ui, in, out) is semidet.
%:- mode search(in, in, out) is semidet.

    % Convert a hash table into an association list.
    %
:- func to_assoc_list(hash_table(K, V)) = assoc_list(K, V).
:- mode to_assoc_list(hash_table_ui) = out is det.
%:- mode to_assoc_list(in) = out is det.

    % Fold a function over the key-value bindings in a hash table.
    %
:- func fold(func(K, V, T) = T, hash_table(K, V), T) = T.
:- mode fold(func(in, in, in) = out is det, hash_table_ui, in) = out is det.
:- mode fold(func(in, in, di) = uo is det, hash_table_ui, di) = uo is det.

%
----------------------------------------------------------------------------
%
%
----------------------------------------------------------------------------
%

:- implementation.

:- import_module exception, math, bool, exception, char, list, std_util.



:- type hash_table(K, V) 
    --->    ht(
                num_buckets             :: int,
                num_occupants           :: int,
                max_occupants           :: int,
                hash_pred               :: hash_pred(K),
                bitmap                  :: bitmap,
                keys                    :: array(K),
                values                  :: array(V)
            ).

%
----------------------------------------------------------------------------
%

    % THE HASHING SCHEME
    %
    % The user provided hashing function computes two independent
    % hashes H1 and H2 for a given key K.
    %
    % We calculate D = 2*H2 + 1 (to ensure that D is non-zero and
    % odd and is therefore coprime to the number of buckets) and
    % probe the table where the Kth probe examines slot
    %
    %   (H1 + K * H2) `rem` num_buckets
    %
    % The search is guaranteed to terminate because table occupancy
    % must be less than 1.0.
    %
    % The bitmap keeps track of which slots are occupied.
    %
    % If a slot is occupied, then the corresponding element in
    % the keys array is used to decide whether this probe has
    % found the right slot.
    %
    % Once the right slot has been found, the value can be found
    % in the corresponding element in the values array.
    %
    % The keys and values arrays cannot be initialised until at
    % least one binding is inserted.  This is handled by checking
    % to see whether the keys array has non-zero size.

%
----------------------------------------------------------------------------
%

    % XXX Need to check that N is not too great, either.
    %
new(HashPred, N, MaxOccupancy) = HT :-
    (      if N =< 1 then
            throw("hash_table__new_hash_table: N =< 1")
      else if MaxOccupancy =< 0.0 ; 1.0 =< MaxOccupancy  then
            throw("hash_table__new_hash_table: MaxOccupancy not in (0.0,
1.0)")
      else
            NumBuckets   = 1 << N,
            MaxOccupants = ceiling_to_int(float(NumBuckets) * MaxOccupancy),
            Bitmap       = bitmap__new(NumBuckets, no),
            Keys         = array__make_empty_array,
            Values       = array__make_empty_array,
            HT = ht(NumBuckets, 0, MaxOccupants, HashPred, Bitmap, Keys,
Values)
    ).

%
----------------------------------------------------------------------------
%

    % These numbers are picked out of thin air.
    %
new_default(HashPred) = new(HashPred, 7, 0.9).

%
----------------------------------------------------------------------------
%

    % find_slot(HT, K) looks up key K in hash table HT and
    % returns the index for the entry K in H.  This is either the
    % first free slot identified (K is not in the table, but here
    % is where it would go) or the slot for K (K is in the table
    % and this is its slot).
    %
    % Whether or not K is actually in the table can be decided
    % by checking to see whether its bit in the bitmap is set
    % or clear.
    %
:- func find_slot(hash_table(K, V), K) = int.
:- mode find_slot(hash_table_ui, in) = out is det.
%:- mode find_slot(in, in) = out is det.

find_slot(HT, K) = H :-
    (HT ^ hash_pred)(K, Hash1a, Hash2a),
    int__abs(Hash1a, Hash1),
    int__abs(Hash2a, Hash2),
    H0    = Hash1 `rem` HT ^ num_buckets,
    Delta = Hash2 + Hash2 + 1,          % Have to ensure it's odd and
non-zero.
    H     = find_slot_2(HT, K, H0, Delta).



:- func find_slot_2(hash_table(K, V), K, int, int) = int.
:- mode find_slot_2(hash_table_ui, in, in, in) = out is det.
%:- mode find_slot_2(in, in, in, in) = out is det.

find_slot_2(HT, K, H0, Delta) = H :-
    ( if bitmap__is_clear(HT ^ bitmap, H0) then
        H  = H0
      else if HT ^ keys ^ elem(H0) = K then
        H  = H0
      else
        H1 = (H0 + Delta) `rem` HT ^ num_buckets,
        H  = find_slot_2(HT, K, H1, Delta)
    ).

%
----------------------------------------------------------------------------
%

set(HT0, K, V) = HT :-

        % If this is the first entry in the hash table, then we use it to
        % set up the hash table (the arrays are currently empty because we
        % need values to initialise them with).
        %
    ( if array__size(HT0 ^ values) = 0 then
        HT = set(
                (( HT0
                        ^ keys   := array__init(HT0 ^ num_buckets, K) )
                        ^ values := array__init(HT0 ^ num_buckets, V) ),
                K, V
             )
      else
        H = find_slot(HT0, K),
        ( if bitmap__is_set(HT0 ^ bitmap, H) then
            HT = ( HT0 ^ values ^ elem(H) := V )
          else
            HT =
                ( if HT0 ^ num_occupants = HT0 ^ max_occupants then
                    set(expand(HT0), K, V)
                  else
                    (((( HT0
                            ^ num_occupants    := HT0 ^ num_occupants + 1 )
                            ^ bitmap           := bitmap__set(HT0 ^ bitmap,
H) )
                            ^ keys ^ elem(H)   := K )
                            ^ values ^ elem(H) := V )
                )
        )
    ).

'elem :='(K, HT, V) = set(HT, K, V).

%
----------------------------------------------------------------------------
%

search(HT, K, V) :-
    H = find_slot(HT, K),
    bitmap__is_set(HT ^ bitmap, H),
    V = HT ^ values ^ elem(H).

%
----------------------------------------------------------------------------
%

det_insert(HT, K, V) =
    ( if bitmap__is_set(HT ^ bitmap, H) then
        throw("hash_table__det_insert: key already present")
      else if HT ^ num_occupants = HT ^ max_occupants then
        set(expand(HT), K, V)
      else
        (((( HT
                ^ num_occupants    := HT ^ num_occupants + 1 )
                ^ bitmap           := bitmap__set(HT ^ bitmap, H) )
                ^ keys ^ elem(H)   := K )
                ^ values ^ elem(H) := V )
    )
 :-
    H = find_slot(HT, K).

%
----------------------------------------------------------------------------
%

det_update(HT, K, V) =
    ( if bitmap__is_clear(HT ^ bitmap, H) then
        throw("hash_table__det_update: key not found")
      else
        HT ^ values ^ elem(H) := V
    )
 :-
    H = find_slot(HT, K).

%
----------------------------------------------------------------------------
%

lookup(HT, K) =
    ( if search(HT, K, V)
      then V
      else throw("hash_table__lookup: key not found")
    ).

elem(K, HT) = lookup(HT, K).

%
----------------------------------------------------------------------------
%

delete(HT, K) =
    HT ^ bitmap := bitmap__clear(HT ^ bitmap, find_slot(HT, K)).

%
----------------------------------------------------------------------------
%

to_assoc_list(HT) = to_assoc_list_2(0, HT, []).



:- func to_assoc_list_2(int,hash_table(K,V),assoc_list(K,V)) =
assoc_list(K,V).
:- mode to_assoc_list_2(in, hash_table_ui, in) = out is det.
%:- mode to_assoc_list_2(in, in, in) = out is det.

to_assoc_list_2(I, HT, AList) =
    ( if I >= HT ^ num_buckets then
        AList
      else if bitmap__is_clear(HT ^ bitmap, I) then
        to_assoc_list_2(I + 1, HT, AList)
      else
        to_assoc_list_2(I + 1, HT,
                [(HT ^ keys ^ elem(I)) - (HT ^ values ^ elem(I)) | AList])
    ).

%
----------------------------------------------------------------------------
%

    % Hash tables expand by doubling in size.
    %
:- func expand(hash_table(K, V)) = hash_table(K, V).
:- mode expand(hash_table_di) = hash_table_uo is det.

expand(HT0) = HT :-

    HT0 = ht(NBs0, _NOs, MOs0, HP, BM0, Ks0, Vs0),

    NBs = NBs0 + NBs0,
    MOs = MOs0 + MOs0,
    BM  = bitmap__new(NBs, no),
    Ks  = array__make_empty_array,
    Vs  = array__make_empty_array,

    HT1 = ht(NBs, 0, MOs, HP, BM, Ks, Vs),

    HT  = reinsert_bindings(0, NBs0, BM0, Ks0, Vs0, HT1).

%
----------------------------------------------------------------------------
%

:- func reinsert_bindings(int, int, bitmap, array(K), array(V),
            hash_table(K, V)) = hash_table(K, V).
:- mode reinsert_bindings(in, in, bitmap_ui, array_ui, array_ui,
            hash_table_di) = hash_table_uo is det.

reinsert_bindings(I, NumBuckets, Bitmap, Keys, Values, HT) =
    ( if I >= NumBuckets then
        HT
      else if bitmap__is_clear(Bitmap, I) then
        reinsert_bindings(I + 1, NumBuckets, Bitmap, Keys, Values, HT)
      else
        reinsert_bindings(I + 1, NumBuckets, Bitmap, Keys, Values,
                set(HT, Keys ^ elem(I), Values ^ elem(I)))
    ).

%
----------------------------------------------------------------------------
%

    % There are almost certainly better ones out there...
    %
int_hash_pred =
    ( pred(N::in, H1::out, H2::out) is det :-
        H1 = N * N,
        H2 = N `xor` (N + N)
    ).

%
----------------------------------------------------------------------------
%

    % There are almost certainly better ones out there...
    %
string_hash_pred =
    ( pred(S::in, H1::out, H2::out) is det :-
        H1 = string__hash(S),
        H2 = string__foldl(func(C, N) = char__to_int(C) + N, S, 0)
    ).

%
----------------------------------------------------------------------------
%

fold(Fn, HT, X) = fold_0(0, Fn, HT, X).



:- func fold_0(int, func(K, V, T) = T, hash_table(K,V), T) = T.
:- mode fold_0(in, func(in,in,in) = out is det, hash_table_ui, in) = out is
det.
:- mode fold_0(in, func(in,in,di) = uo is det, hash_table_ui, di) = uo is
det.
% :- mode fold_0(in, func(in,in,in) = out is det, in, in) = out is det.
% :- mode fold_0(in, func(in,in,di) = uo is det, in, di) = uo is det.

fold_0(I, Fn, HT, X) =
    ( if I >= HT ^ num_buckets then
        X
      else if bitmap__is_clear(HT ^ bitmap, I) then
        fold_0(I + 1, Fn, HT, X)
      else
        fold_0(I + 1, Fn, HT, Fn(HT ^ keys ^ elem(I), HT ^ values ^ elem(I),
X))
    ).

%
----------------------------------------------------------------------------
%
%
----------------------------------------------------------------------------
%



--
Ralph Becket      |      MSR Cambridge      |      rbeck at microsoft.com 
--------------------------------------------------------------------------
mercury-developers mailing list
Post messages to:       mercury-developers at cs.mu.oz.au
Administrative Queries: owner-mercury-developers at cs.mu.oz.au
Subscriptions:          mercury-developers-request at cs.mu.oz.au
--------------------------------------------------------------------------



More information about the developers mailing list