[m-rev.] for review: make characters an instance of the uenum typeclass
Julien Fischer
jfischer at opturion.com
Tue Dec 20 18:00:32 AEDT 2022
For review by anyone.
---------------------
Make characters an instance of the uenum typeclass
The recent change to sparse_bitsets broke the lex library in extras.
Specifically, we now now need to make characters an instance of the
uenum typeclass. This diff does so.
library/char.m:
Add predicates and functions for converting between unsigned integers
and characters.
Make characters an instance of the uenum typeclass.
tests/hard_coded/Mmakefile:
tests/hard_coded/char_uint_conv.{m,exp,exp2}:
Add a test of the above conversions.
NEWS:
Announce the additions.
extras/lex.m:
Conform to recent changes.
Julien.
diff --git a/NEWS b/NEWS
index a5322b4..913abd3 100644
--- a/NEWS
+++ b/NEWS
@@ -102,6 +102,18 @@ Changes to the Mercury standard library
- func `promise_only_solution/1`
- pred `promise_only_solution_io/4`
+### Changes to the `char` module
+
+* The following type has had its typeclass memberships changed:
+
+ - The type `character` is now an instance of the new `uenum` typeclass.
+
+* The following predicate and functions have been added:
+
+ - func `to_uint/1`
+ - pred `from_uint/2`
+ - func `det_from_uint/1`
+
### Changes to the `cord` module
* The following predicates have been added:
diff --git a/extras/lex/lex.m b/extras/lex/lex.m
index 1f934c2..5e8722b 100644
--- a/extras/lex/lex.m
+++ b/extras/lex/lex.m
@@ -98,7 +98,7 @@
:- instance regexp(regexp).
:- instance regexp(char).
:- instance regexp(string).
-:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)).
+:- instance regexp(sparse_bitset(T)) <= (regexp(T),uenum(T)).
% Some basic non-primitive regexps.
%
@@ -720,10 +720,10 @@ read_from_string(Offset, Result, String, unsafe_promise_unique(String)) :-
)
].
-:- instance regexp(sparse_bitset(T)) <= (regexp(T),enum(T)) where [
+:- instance regexp(sparse_bitset(T)) <= (regexp(T),uenum(T)) where [
re(SparseBitset) = charset(Charset) :-
Charset = sparse_bitset.foldl(
- func(Enum, Set0) = insert(Set0, char.det_from_int(to_int(Enum))),
+ func(Enum, Set0) = insert(Set0, char.det_from_uint(to_uint(Enum))),
SparseBitset,
sparse_bitset.init)
].
diff --git a/library/char.m b/library/char.m
index ba6e093..de29c6e 100644
--- a/library/char.m
+++ b/library/char.m
@@ -36,6 +36,7 @@
:- type char == character.
:- instance enum(character).
+:- instance uenum(character).
% `to_int'/1 and `to_int(in, out)' convert a character to its
% corresponding numerical code (integer value).
@@ -71,6 +72,19 @@
:- func det_from_int(int) = char.
:- pred det_from_int(int::in, char::out) is det.
+ % Converts a character to its numerical character code (unsigned integer).
+ %
+:- func to_uint(char) = uint.
+
+ % Converts an unsigned integer to its corresponding character, if any.
+ %
+:- pred from_uint(uint::in, char::out) is semidet.
+
+ % Converts an unsigned integer to its corresponding character.
+ % Throws an exception if there isn't one.
+ %
+:- func det_from_uint(uint) = char.
+
% Returns the minimum numerical character code.
%
:- func min_char_value = int.
@@ -402,6 +416,11 @@
to_int(Y, X))
].
+:- instance uenum(character) where [
+ func(to_uint/1) is char.to_uint,
+ pred(from_uint/2) is char.from_uint
+].
+
:- pragma foreign_decl("C", "#include <limits.h>").
%---------------------------------------------------------------------------%
@@ -493,6 +512,62 @@ det_from_int(Int, Char) :-
%---------------------------------------------------------------------------%
+:- pragma foreign_proc("C",
+ to_uint(Character::in) = (UInt::out),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness],
+"
+ UInt = (MR_UnsignedChar) Character;
+").
+
+:- pragma foreign_proc("C#",
+ to_uint(Character::in) = (UInt::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ UInt = (uint) Character;
+").
+
+:- pragma foreign_proc("Java",
+ to_uint(Character::in) = (UInt::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ UInt = Character;
+").
+
+:- pragma foreign_proc("C",
+ from_uint(UInt::in, Character::out),
+ [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
+ does_not_affect_liveness],
+"
+ Character = (MR_UnsignedChar) UInt;
+ SUCCESS_INDICATOR = (UInt <= 0x10ffff);
+").
+
+:- pragma foreign_proc("C#",
+ from_uint(UInt::in, Character::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ Character = (int) UInt;
+ SUCCESS_INDICATOR = (UInt <= 0x10ffff);
+").
+
+:- pragma foreign_proc("Java",
+ from_uint(UInt::in, Character::out),
+ [will_not_call_mercury, promise_pure, thread_safe],
+"
+ Character = UInt;
+ SUCCESS_INDICATOR = ((UInt & 0xffffffffL) <= (0x10ffff & 0xffffffffL));
+").
+
+det_from_uint(UInt) = Char :-
+ ( if char.from_uint(UInt, CharPrime) then
+ Char = CharPrime
+ else
+ unexpected($pred, "conversion failed")
+ ).
+
+%---------------------------------------------------------------------------%
+
min_char_value = N :-
min_char_value(N).
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 6fbd606..1f016ed 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -59,6 +59,7 @@ ORDINARY_PROGS = \
change_hunk_test \
char_not_surrogate \
char_signed \
+ char_uint_conv \
char_unicode \
checked_nondet_tailcall \
checked_nondet_tailcall_noinline \
diff --git a/tests/hard_coded/char_uint_conv.exp b/tests/hard_coded/char_uint_conv.exp
index e69de29..341068b 100644
--- a/tests/hard_coded/char_uint_conv.exp
+++ b/tests/hard_coded/char_uint_conv.exp
@@ -0,0 +1,41 @@
+Testing uint -> char conversion:
+from_uint(0x0) ==> '\000\'
+from_uint(0x1) ==> '\001\'
+from_uint(0x7f) ==> '\177\'
+from_uint(0x80) ==> '\200\'
+from_uint(0x7ff) ==> '߿'
+from_uint(0x800) ==> 'ࠀ'
+from_uint(0xffff) ==> ''
+from_uint(0x10000) ==> '𐀀'
+from_uint(0x10ffff) ==> ''
+from_uint(0x110000) ==> FAILED
+from_uint(0xfffffffffffffffe) ==> FAILED
+from_uint(0xffffffffffffffff) ==> FAILED
+
+Testing char -> uint conversion:
+to_uint('\000\') ==> 0x0
+to_uint('\001\') ==> 0x1
+to_uint('\037\') ==> 0x1f
+to_uint(' ') ==> 0x20
+to_uint('0') ==> 0x30
+to_uint('A') ==> 0x41
+to_uint('Z') ==> 0x5a
+to_uint('a') ==> 0x61
+to_uint('z') ==> 0x7a
+to_uint('~') ==> 0x7e
+to_uint('\177\') ==> 0x7f
+to_uint('\200\') ==> 0x80
+to_uint('\237\') ==> 0x9f
+to_uint(' ') ==> 0xa0
+to_uint('¿') ==> 0xbf
+to_uint('À') ==> 0xc0
+to_uint('ÿ') ==> 0xff
+to_uint('𐀀') ==> 0x10000
+to_uint('𐁝') ==> 0x1005d
+to_uint('🬀') ==> 0x1fb00
+to_uint('🯹') ==> 0x1fbf9
+to_uint('𠀀') ==> 0x20000
+to_uint('𪛟') ==> 0x2a6df
+to_uint('丽') ==> 0x2f800
+to_uint('') ==> 0x2fa1f
+to_uint('') ==> 0x10ffff
diff --git a/tests/hard_coded/char_uint_conv.exp2 b/tests/hard_coded/char_uint_conv.exp2
index e69de29..05c9c16 100644
--- a/tests/hard_coded/char_uint_conv.exp2
+++ b/tests/hard_coded/char_uint_conv.exp2
@@ -0,0 +1,41 @@
+Testing uint -> char conversion:
+from_uint(0x0) ==> '\000\'
+from_uint(0x1) ==> '\001\'
+from_uint(0x7f) ==> '\177\'
+from_uint(0x80) ==> '\200\'
+from_uint(0x7ff) ==> '߿'
+from_uint(0x800) ==> 'ࠀ'
+from_uint(0xffff) ==> ''
+from_uint(0x10000) ==> '𐀀'
+from_uint(0x10ffff) ==> ''
+from_uint(0x110000) ==> FAILED
+from_uint(0xfffffffe) ==> FAILED
+from_uint(0xffffffff) ==> FAILED
+
+Testing char -> uint conversion:
+to_uint('\000\') ==> 0x0
+to_uint('\001\') ==> 0x1
+to_uint('\037\') ==> 0x1f
+to_uint(' ') ==> 0x20
+to_uint('0') ==> 0x30
+to_uint('A') ==> 0x41
+to_uint('Z') ==> 0x5a
+to_uint('a') ==> 0x61
+to_uint('z') ==> 0x7a
+to_uint('~') ==> 0x7e
+to_uint('\177\') ==> 0x7f
+to_uint('\200\') ==> 0x80
+to_uint('\237\') ==> 0x9f
+to_uint(' ') ==> 0xa0
+to_uint('¿') ==> 0xbf
+to_uint('À') ==> 0xc0
+to_uint('ÿ') ==> 0xff
+to_uint('𐀀') ==> 0x10000
+to_uint('𐁝') ==> 0x1005d
+to_uint('🬀') ==> 0x1fb00
+to_uint('🯹') ==> 0x1fbf9
+to_uint('𠀀') ==> 0x20000
+to_uint('𪛟') ==> 0x2a6df
+to_uint('丽') ==> 0x2f800
+to_uint('') ==> 0x2fa1f
+to_uint('') ==> 0x10ffff
diff --git a/tests/hard_coded/char_uint_conv.m b/tests/hard_coded/char_uint_conv.m
index e69de29..bc59026 100644
--- a/tests/hard_coded/char_uint_conv.m
+++ b/tests/hard_coded/char_uint_conv.m
@@ -0,0 +1,111 @@
+%---------------------------------------------------------------------------%
+% vim: ft=mercury ts=4 sw=4 et
+%---------------------------------------------------------------------------%
+% Test conversion of uints to characters and vice versa.
+% The .exp file is for grades where uint is 64 bits.
+% The .exp2 file is for grades where uint is 32 bits.
+%---------------------------------------------------------------------------%
+
+:- module char_uint_conv.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module char.
+:- import_module list.
+:- import_module string.
+:- import_module uint.
+:- import_module term_io.
+
+main(!IO) :-
+ io.print_line("Testing uint -> char conversion:", !IO),
+ list.foldl(test_from_uint, test_uints, !IO),
+ io.nl(!IO),
+ io.print_line("Testing char -> uint conversion:", !IO),
+ list.foldl(test_to_uint, test_chars, !IO).
+
+:- pred test_from_uint(uint::in, io::di, io::uo) is det.
+
+test_from_uint(UInt, !IO) :-
+ ( if char.from_uint(UInt, Char) then
+ io.format("from_uint(0x%x) ==> %s\n", [u(UInt), s(quoted_char(Char))], !IO)
+ else
+ io.format("from_uint(0x%x) ==> FAILED\n", [u(UInt)], !IO)
+ ).
+
+:- func test_uints = list(uint).
+
+test_uints = [
+ 0x0_u,
+ 1_u,
+ 0x7f_u,
+ 0x80_u,
+ 0x7ff_u,
+ 0x800_u,
+ 0xffff_u,
+ 0x10000_u,
+ 0x10ffff_u,
+ 0x110000_u,
+ max_uint - 1u,
+ max_uint].
+
+:- pred test_to_uint(char::in, io::di, io::uo) is det.
+
+test_to_uint(Char, !IO) :-
+ UInt = char.to_uint(Char),
+ io.format("to_uint(%s) ==> 0x%x\n", [s(quoted_char(Char)), u(UInt)], !IO).
+
+:- func test_chars = list(char).
+
+test_chars = [
+ % C0 Controls and Basic Latin
+ % (First BMP block)
+ char.det_from_int(0x0), % NULL CHARACTER
+ char.det_from_int(0x1), % START OF HEADING
+ char.det_from_int(0x1f), % UNIT SEPARATOR
+ ' ',
+ '0',
+ 'A',
+ 'Z',
+ 'a',
+ 'z',
+ '~',
+ char.det_from_int(0x7f), % DELETE
+
+ % C1 Controls and Latin-1 Supplement
+ char.det_from_int(0x80), % PADDING CHARACTER
+ char.det_from_int(0x9f), % APPLICATION PROGRAM COMMAND
+ char.det_from_int(0xa0), % NO-BREAK SPACE
+ char.det_from_int(0xbf), % INVERTED QUESTION MARK
+ char.det_from_int(0xc0), % LATIN CAPITAL LETTER A WITH GRAVE
+ char.det_from_int(0xff), % LATIN SMALL LETTER Y WITH DIAERESIS
+
+ % Linear B Syllabary
+ % (First SMP block)
+ char.det_from_int(0x10000), % LINEAR B SYLLABLE B008 A
+ char.det_from_int(0x1005d), % LINEAR B SYMBOL B089
+
+ % Symbols for Legacy Computing.
+ % (Last SMP block)
+ char.det_from_int(0x1fb00), % BLOCK SEXTANT-1
+ char.det_from_int(0x1fbf9), % SEGEMENTED DIGIT NINE
+
+ % CJK Unified Idenographs Extension B
+ % (First SIP block)
+ char.det_from_int(0x20000),
+ char.det_from_int(0x2a6df),
+
+ % CJK Compatibility Ideographs Supplement
+ % (Last SIP block),
+ char.det_from_int(0x2f800),
+ char.det_from_int(0x2fa1f),
+
+ % Last valid Unicode code point.
+ char.det_from_int(0x10ffff)].
More information about the reviews
mailing list