[m-rev.] for review: make characters an instance of the uenum typeclass

Julien Fischer jfischer at opturion.com
Tue Dec 20 18:00:32 AEDT 2022


For review by anyone.

---------------------

Make characters an instance of the uenum typeclass

The recent change to sparse_bitsets broke the lex library in extras.
Specifically, we now now need to make characters an instance of the
uenum typeclass. This diff does so.

library/char.m:
      Add predicates and functions for converting between unsigned integers
      and characters.

      Make characters an instance of the uenum typeclass.

tests/hard_coded/Mmakefile:
tests/hard_coded/char_uint_conv.{m,exp,exp2}:
      Add a test of the above conversions.

NEWS:
      Announce the additions.

extras/lex.m:
      Conform to recent changes.

Julien.

diff --git a/NEWS b/NEWS
index a5322b4..913abd3 100644
--- a/NEWS
+++ b/NEWS
@@ -102,6 +102,18 @@ Changes to the Mercury standard library
      - func `promise_only_solution/1`
      - pred `promise_only_solution_io/4`

+### Changes to the `char` module
+
+* The following type has had its typeclass memberships changed:
+
+    - The type `character` is now an instance of the new `uenum` typeclass.
+
+* The following predicate and functions have been added:
+
+   - func `to_uint/1`
+   - pred `from_uint/2`
+   - func `det_from_uint/1`
+
  ### Changes to the `cord` module

  * The following predicates have been added:
diff --git a/extras/lex/lex.m b/extras/lex/lex.m
index 1f934c2..5e8722b 100644
--- a/extras/lex/lex.m
+++ b/extras/lex/lex.m
@@ -98,7 +98,7 @@
  :- instance regexp(regexp).
  :- instance regexp(char).
  :- instance regexp(string).
-:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)).
+:- instance regexp(sparse_bitset(T)) <= (regexp(T),uenum(T)).

      % Some basic non-primitive regexps.
      %
@@ -720,10 +720,10 @@ read_from_string(Offset, Result, String, unsafe_promise_unique(String)) :-
          )
  ].

-:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)) where [
+:- instance regexp(sparse_bitset(T)) <= (regexp(T),uenum(T)) where [
      re(SparseBitset) = charset(Charset) :-
          Charset = sparse_bitset.foldl(
-            func(Enum, Set0) = insert(Set0, char.det_from_int(to_int(Enum))),
+            func(Enum, Set0) = insert(Set0, char.det_from_uint(to_uint(Enum))),
              SparseBitset,
              sparse_bitset.init)
  ].
diff --git a/library/char.m b/library/char.m
index ba6e093..de29c6e 100644
--- a/library/char.m
+++ b/library/char.m
@@ -36,6 +36,7 @@
  :- type char == character.

  :- instance enum(character).
+:- instance uenum(character).

      % `to_int'/1 and `to_int(in, out)' convert a character to its
      % corresponding numerical code (integer value).
@@ -71,6 +72,19 @@
  :- func det_from_int(int) = char.
  :- pred det_from_int(int::in, char::out) is det.

+    % Converts a character to its numerical character code (unsigned integer).
+    %
+:- func to_uint(char) = uint.
+
+    % Converts an unsigned integer to its corresponding character, if any.
+    %
+:- pred from_uint(uint::in, char::out) is semidet.
+
+    % Converts an unsigned integer to its corresponding character.
+    % Throws an exception if there isn't one.
+    %
+:- func det_from_uint(uint) = char.
+
      % Returns the minimum numerical character code.
      %
  :- func min_char_value = int.
@@ -402,6 +416,11 @@
          to_int(Y, X))
  ].

+:- instance uenum(character) where [
+    func(to_uint/1) is char.to_uint,
+    pred(from_uint/2) is char.from_uint
+].
+
  :- pragma foreign_decl("C", "#include <limits.h>").

  %---------------------------------------------------------------------------%
@@ -493,6 +512,62 @@ det_from_int(Int, Char) :-

  %---------------------------------------------------------------------------%

+:- pragma foreign_proc("C",
+    to_uint(Character::in) = (UInt::out),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness],
+"
+    UInt = (MR_UnsignedChar) Character;
+").
+
+:- pragma foreign_proc("C#",
+    to_uint(Character::in) = (UInt::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    UInt = (uint) Character;
+").
+
+:- pragma foreign_proc("Java",
+    to_uint(Character::in) = (UInt::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    UInt = Character;
+").
+
+:- pragma foreign_proc("C",
+    from_uint(UInt::in, Character::out),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+        does_not_affect_liveness],
+"
+    Character = (MR_UnsignedChar) UInt;
+    SUCCESS_INDICATOR = (UInt <= 0x10ffff);
+").
+
+:- pragma foreign_proc("C#",
+    from_uint(UInt::in, Character::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    Character = (int) UInt;
+    SUCCESS_INDICATOR = (UInt <= 0x10ffff);
+").
+
+:- pragma foreign_proc("Java",
+    from_uint(UInt::in, Character::out),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    Character = UInt;
+    SUCCESS_INDICATOR = ((UInt & 0xffffffffL) <= (0x10ffff & 0xffffffffL));
+").
+
+det_from_uint(UInt) = Char :-
+    ( if char.from_uint(UInt, CharPrime) then
+        Char = CharPrime
+    else
+        unexpected($pred, "conversion failed")
+    ).
+
+%---------------------------------------------------------------------------%
+
  min_char_value = N :-
      min_char_value(N).

diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 6fbd606..1f016ed 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -59,6 +59,7 @@ ORDINARY_PROGS = \
  	change_hunk_test \
  	char_not_surrogate \
  	char_signed \
+	char_uint_conv \
  	char_unicode \
  	checked_nondet_tailcall \
  	checked_nondet_tailcall_noinline \
diff --git a/tests/hard_coded/char_uint_conv.exp b/tests/hard_coded/char_uint_conv.exp
index e69de29..341068b 100644
--- a/tests/hard_coded/char_uint_conv.exp
+++ b/tests/hard_coded/char_uint_conv.exp
@@ -0,0 +1,41 @@
+Testing uint -> char conversion:
+from_uint(0x0) ==> '\000\'
+from_uint(0x1) ==> '\001\'
+from_uint(0x7f) ==> '\177\'
+from_uint(0x80) ==> '\200\'
+from_uint(0x7ff) ==> '߿'
+from_uint(0x800) ==> 'ࠀ'
+from_uint(0xffff) ==> '￿'
+from_uint(0x10000) ==> '𐀀'
+from_uint(0x10ffff) ==> '􏿿'
+from_uint(0x110000) ==> FAILED
+from_uint(0xfffffffffffffffe) ==> FAILED
+from_uint(0xffffffffffffffff) ==> FAILED
+
+Testing char -> uint conversion:
+to_uint('\000\') ==> 0x0
+to_uint('\001\') ==> 0x1
+to_uint('\037\') ==> 0x1f
+to_uint(' ') ==> 0x20
+to_uint('0') ==> 0x30
+to_uint('A') ==> 0x41
+to_uint('Z') ==> 0x5a
+to_uint('a') ==> 0x61
+to_uint('z') ==> 0x7a
+to_uint('~') ==> 0x7e
+to_uint('\177\') ==> 0x7f
+to_uint('\200\') ==> 0x80
+to_uint('\237\') ==> 0x9f
+to_uint(' ') ==> 0xa0
+to_uint('¿') ==> 0xbf
+to_uint('À') ==> 0xc0
+to_uint('ÿ') ==> 0xff
+to_uint('𐀀') ==> 0x10000
+to_uint('𐁝') ==> 0x1005d
+to_uint('🬀') ==> 0x1fb00
+to_uint('🯹') ==> 0x1fbf9
+to_uint('𠀀') ==> 0x20000
+to_uint('𪛟') ==> 0x2a6df
+to_uint('丽') ==> 0x2f800
+to_uint('𯨟') ==> 0x2fa1f
+to_uint('􏿿') ==> 0x10ffff
diff --git a/tests/hard_coded/char_uint_conv.exp2 b/tests/hard_coded/char_uint_conv.exp2
index e69de29..05c9c16 100644
--- a/tests/hard_coded/char_uint_conv.exp2
+++ b/tests/hard_coded/char_uint_conv.exp2
@@ -0,0 +1,41 @@
+Testing uint -> char conversion:
+from_uint(0x0) ==> '\000\'
+from_uint(0x1) ==> '\001\'
+from_uint(0x7f) ==> '\177\'
+from_uint(0x80) ==> '\200\'
+from_uint(0x7ff) ==> '߿'
+from_uint(0x800) ==> 'ࠀ'
+from_uint(0xffff) ==> '￿'
+from_uint(0x10000) ==> '𐀀'
+from_uint(0x10ffff) ==> '􏿿'
+from_uint(0x110000) ==> FAILED
+from_uint(0xfffffffe) ==> FAILED
+from_uint(0xffffffff) ==> FAILED
+
+Testing char -> uint conversion:
+to_uint('\000\') ==> 0x0
+to_uint('\001\') ==> 0x1
+to_uint('\037\') ==> 0x1f
+to_uint(' ') ==> 0x20
+to_uint('0') ==> 0x30
+to_uint('A') ==> 0x41
+to_uint('Z') ==> 0x5a
+to_uint('a') ==> 0x61
+to_uint('z') ==> 0x7a
+to_uint('~') ==> 0x7e
+to_uint('\177\') ==> 0x7f
+to_uint('\200\') ==> 0x80
+to_uint('\237\') ==> 0x9f
+to_uint(' ') ==> 0xa0
+to_uint('¿') ==> 0xbf
+to_uint('À') ==> 0xc0
+to_uint('ÿ') ==> 0xff
+to_uint('𐀀') ==> 0x10000
+to_uint('𐁝') ==> 0x1005d
+to_uint('🬀') ==> 0x1fb00
+to_uint('🯹') ==> 0x1fbf9
+to_uint('𠀀') ==> 0x20000
+to_uint('𪛟') ==> 0x2a6df
+to_uint('丽') ==> 0x2f800
+to_uint('𯨟') ==> 0x2fa1f
+to_uint('􏿿') ==> 0x10ffff
diff --git a/tests/hard_coded/char_uint_conv.m b/tests/hard_coded/char_uint_conv.m
index e69de29..bc59026 100644
--- a/tests/hard_coded/char_uint_conv.m
+++ b/tests/hard_coded/char_uint_conv.m
@@ -0,0 +1,111 @@
+%---------------------------------------------------------------------------%
+% vim: ft=mercury ts=4 sw=4 et
+%---------------------------------------------------------------------------%
+% Test conversion of uints to characters and vice versa.
+% The .exp file is for grades where uint is 64 bits.
+% The .exp2 file is for grades where uint is 32 bits.
+%---------------------------------------------------------------------------%
+
+:- module char_uint_conv.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+:- import_module uint.
+:- import_module term_io.
+
+main(!IO) :-
+    io.print_line("Testing uint -> char conversion:", !IO),
+    list.foldl(test_from_uint, test_uints, !IO),
+    io.nl(!IO),
+    io.print_line("Testing char -> uint conversion:", !IO),
+    list.foldl(test_to_uint, test_chars, !IO).
+
+:- pred test_from_uint(uint::in, io::di, io::uo) is det.
+
+test_from_uint(UInt, !IO) :-
+    ( if char.from_uint(UInt, Char) then
+        io.format("from_uint(0x%x) ==> %s\n", [u(UInt), s(quoted_char(Char))], !IO)
+    else
+        io.format("from_uint(0x%x) ==> FAILED\n", [u(UInt)], !IO)
+    ).
+
+:- func test_uints = list(uint).
+
+test_uints = [
+    0x0_u,
+    1_u,
+    0x7f_u,
+    0x80_u,
+    0x7ff_u,
+    0x800_u,
+    0xffff_u,
+    0x10000_u,
+    0x10ffff_u,
+    0x110000_u,
+    max_uint - 1u,
+    max_uint].
+
+:- pred test_to_uint(char::in, io::di, io::uo) is det.
+
+test_to_uint(Char, !IO) :-
+    UInt = char.to_uint(Char),
+    io.format("to_uint(%s) ==> 0x%x\n", [s(quoted_char(Char)), u(UInt)], !IO).
+
+:- func test_chars = list(char).
+
+test_chars = [
+    % C0 Controls and Basic Latin
+    % (First BMP block)
+    char.det_from_int(0x0),   % NULL CHARACTER
+    char.det_from_int(0x1),   % START OF HEADING
+    char.det_from_int(0x1f),  % UNIT SEPARATOR
+    ' ',
+    '0',
+    'A',
+    'Z',
+    'a',
+    'z',
+    '~',
+    char.det_from_int(0x7f), % DELETE
+
+    % C1 Controls and Latin-1 Supplement
+    char.det_from_int(0x80),   % PADDING CHARACTER
+    char.det_from_int(0x9f),   % APPLICATION PROGRAM COMMAND
+    char.det_from_int(0xa0),   % NO-BREAK SPACE
+    char.det_from_int(0xbf),   % INVERTED QUESTION MARK
+    char.det_from_int(0xc0),   % LATIN CAPITAL LETTER A WITH GRAVE
+    char.det_from_int(0xff),   % LATIN SMALL LETTER Y WITH DIAERESIS
+
+    % Linear B Syllabary
+    % (First SMP block)
+    char.det_from_int(0x10000), % LINEAR B SYLLABLE B008 A
+    char.det_from_int(0x1005d), % LINEAR B SYMBOL B089
+
+    % Symbols for Legacy Computing.
+    % (Last SMP block)
+    char.det_from_int(0x1fb00), % BLOCK SEXTANT-1
+    char.det_from_int(0x1fbf9), % SEGEMENTED DIGIT NINE
+
+    % CJK Unified Idenographs Extension B
+    % (First SIP block)
+    char.det_from_int(0x20000),
+    char.det_from_int(0x2a6df),
+
+    % CJK Compatibility Ideographs Supplement
+    % (Last SIP block),
+    char.det_from_int(0x2f800),
+    char.det_from_int(0x2fa1f),
+
+    % Last valid Unicode code point.
+    char.det_from_int(0x10ffff)].


More information about the reviews mailing list