[m-rev.] for review: speed up uint32 to string conversion

Julien Fischer jfischer at opturion.com
Wed Oct 27 16:10:18 AEDT 2021


Hi,

The following is the output of the benchmark contained in this diff on my Linux
server (x86_64; CentOS 7; gcc 4.8.5, glibc 2.17); I get comparable results on
macOS (10.14.6; Apple clang 1001.0.46.4) and Windows (MinGW64; gcc 10.3.0)

     n = 10000000; repeats = 20, grade = asm_fast.gc
     seed: a = 5691454532950328060, b = 8741801350575209016, c = 18328690396456478222
      Std: 24440ms
     Alt1: 8290ms ratio: 2.95
     Alt2: 8220ms ratio: 2.97
     Alt3: 8320ms ratio: 2.94

     n = 10000000; repeats = 20, grade = hlc.gc
     seed: a = 7457373037195609628, b = 16782061460301219139, c = 16477900985025701128
      Std: 22940ms
     Alt1: 6500ms ratio: 3.53
     Alt2: 6540ms ratio: 3.51
     Alt3: 6910ms ratio: 3.32

If anyone would care to try the benchmark on other system, I would be curious
to know the result.

------------------------------

Speed up uint32 to string conversion in C grades.

In C grades, uint32_to_string/1 is currently implemented by doing the
following:

  1. Calling sprintf() to write the string into a buffer on the stack.
  2. Calling strlen() on the buffer to determine the actual number of digits.
  3. Allocate the appropriate amount of space on the heap.
  4. Copying the string from the buffer on to the heap.

The benchmark included in this diff contains a number of alternative
implementations, all of which avoid much of the above overhead.
All of these implementations work by computing the required number of digits,
allocating space on the heap and then writing the digits into the string
backwards.

library/string.m:
     Replace uint32_to_string with a faster implementation.
     (This is alternative 3 in the benchmark program below.)

tests/hard_coded/Mmakefile:
tests/hard_coded/uint32_to_string.{m,exp}:
     A test of uint32 to string.

benchmarks/progs/integer_to_string/uint32_conversion.m:
     A program for benchmarking various uint32 to string conversion
     implementations.

Julien.

diff --git a/benchmarks/progs/integer_to_string/uint32_conversion.m b/benchmarks/progs/integer_to_string/uint32_conversion.m
new file mode 100644
index 0000000..77b0059
--- /dev/null
+++ b/benchmarks/progs/integer_to_string/uint32_conversion.m
@@ -0,0 +1,278 @@
+%---------------------------------------------------------------------------%
+% vim: ft=mercury ts=4 sw=4 et
+%---------------------------------------------------------------------------%
+
+:- module uint32_conversion.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is cc_multi.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module array.
+:- import_module benchmarking.
+:- import_module float.
+:- import_module list.
+:- import_module maybe.
+:- import_module random.
+:- import_module random.sfc64.
+:- import_module random.system_rng.
+:- import_module require.
+:- import_module string.
+:- import_module uint32.
+:- import_module unit.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    NumRepeats = 20,
+    NumElements = 10_000_000,
+    io.format("n = %d; repeats = %d, grade = %s\n",
+        [i(NumElements), i(NumRepeats), s($grade)], !IO),
+    randomly_fill_array(NumElements, Array, SeedA, SeedB, SeedC, !IO),
+    io.format("seed: a = %u, b = %u, c = %u\n",
+        [u64(SeedA), u64(SeedB), u64(SeedC)], !IO),
+    benchmark_det_io(run_test(std_uint32_to_string), Array, _, !IO,
+        NumRepeats, TimeStd),
+    io.format(" Std: %dms\n", [i(TimeStd)], !IO),
+    benchmark_det_io(run_test(alt1_uint32_to_string), Array, _, !IO,
+        NumRepeats, TimeAlt1),
+    io.format("Alt1: %dms ratio: %.2f\n",
+        [i(TimeAlt1), f(float(TimeStd) / float(TimeAlt1))], !IO),
+    benchmark_det_io(run_test(alt2_uint32_to_string), Array, _, !IO,
+        NumRepeats, TimeAlt2),
+    io.format("Alt2: %dms ratio: %.2f\n",
+        [i(TimeAlt2), f(float(TimeStd) / float(TimeAlt2))], !IO),
+    benchmark_det_io(run_test(alt3_uint32_to_string), Array, _, !IO,
+        NumRepeats, TimeAlt3),
+    io.format("Alt3: %dms ratio: %.2f\n",
+        [i(TimeAlt3), f(float(TimeStd) / float(TimeAlt3))], !IO).
+
+%---------------------------------------------------------------------------%
+
+:- pred randomly_fill_array(int::in, array(uint32)::array_uo,
+    uint64::out, uint64::out, uint64::out, io::di, io::uo) is det.
+
+randomly_fill_array(Size, Array, A, B, C, !IO) :-
+    open_system_rng(MaybeSysRNG, !IO),
+    (
+        MaybeSysRNG = ok(SysRNG)
+    ;
+        MaybeSysRNG = error(Error),
+        error(string(Error))
+    ),
+    system_rng.generate_uint64(SysRNG, A, !IO),
+    system_rng.generate_uint64(SysRNG, B, !IO),
+    system_rng.generate_uint64(SysRNG, C, !IO),
+    close_system_rng(SysRNG, !IO),
+    sfc64.seed(A, B, C, Params, State0),
+    array.generate_foldl(Size, generate_int(Params), Array, State0, _State).
+
+:- pred generate_int(sfc64.params::in, int::in, uint32::out,
+    sfc64.ustate::di, sfc64.ustate::uo) is det.
+
+generate_int(Params, _I, N, !State) :-
+    random.generate_uint32(Params, N, !State).
+
+%---------------------------------------------------------------------------%
+
+:- pred run_test((func(uint32) = string)::in(func(in) = uo is det),
+    array(uint32)::in, unit::out, io::di, io::uo) is det.
+
+run_test(Func, Array, unit, !IO) :-
+    array.foldl(do_test(Func), Array, !IO).
+
+%---------------------------------------------------------------------------%
+
+:- pred do_test((func(uint32) = string)::in(func(in) = uo is det),
+    uint32::in, io::di, io::uo) is det.
+
+do_test(Func, N, !IO) :-
+    S = Func(N),
+    consume_string(S, !IO).
+
+%---------------------------------------------------------------------------%
+
+:- pragma no_inline(pred(consume_string/3)).
+:- pred consume_string(string::in, io::di, io::uo) is det.
+:- pragma foreign_proc("C",
+    consume_string(S::in, _IO0::di, _IO::uo),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    // S
+").
+:- pragma foreign_proc("C#",
+    consume_string(S::in, _IO0::di, _IO::uo),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    // S
+").
+:- pragma foreign_proc("Java",
+    consume_string(S::in, _IO0::di, _IO::uo),
+    [will_not_call_mercury, promise_pure, thread_safe],
+"
+    // S
+").
+
+%---------------------------------------------------------------------------%
+%
+% Std implementation using sprintf().
+%
+
+:- func std_uint32_to_string(uint32::in) = (string::uo) is det.
+:- pragma foreign_proc("C",
+    std_uint32_to_string(U32::in) = (S::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail],
+"
+    char buffer[11]; // 10 for digits, 1 for nul.
+    sprintf(buffer, ""%"" PRIu32, U32);
+    MR_allocate_aligned_string_msg(S, strlen(buffer), MR_ALLOC_ID);
+    strcpy(S, buffer);
+").
+
+%---------------------------------------------------------------------------%
+
+:- func alt1_uint32_to_string(uint32::in) = (string::uo) is det.
+:- pragma foreign_proc("C",
+    alt1_uint32_to_string(U::in) = (S::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail],
+"
+    int num_digits;
+    if (U < 10) {
+        num_digits = 1;
+    } else if (U < 100) {
+        num_digits = 2;
+    } else if (U < 1000) {
+        num_digits = 3;
+    } else if (U < 10000) {
+        num_digits = 4;
+    } else if (U < 100000) {
+        num_digits = 5;
+    } else if (U < 1000000) {
+        num_digits = 6;
+    } else if (U < 10000000) {
+        num_digits = 7;
+    } else if (U < 100000000) {
+        num_digits = 8;
+    } else if (U < 1000000000) {
+        num_digits = 9;
+    } else {
+        num_digits = 10;
+    }
+
+    MR_allocate_aligned_string_msg(S, num_digits, MR_ALLOC_ID);
+    S[num_digits] = '\\0';
+    int i = num_digits - 1;
+    do {
+        S[i] = \"0123456789\"[U % 10];
+        i--;
+    } while ( U /= 10 );
+").
+
+%---------------------------------------------------------------------------%
+
+% Same as alt1 except it uses Andrei Alexandrescu's digit counting method.
+% See: <https://www.facebook.com/notes/10158791579037200/>
+% This is biased in favour of small numbers, it doesn't really help for
+% this benchmark.
+%
+:- func alt2_uint32_to_string(uint32::in) = (string::uo) is det.
+:- pragma foreign_proc("C",
+    alt2_uint32_to_string(U::in) = (S::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail],
+"
+    uint32_t v = U;
+    int num_digits = 1;
+    for (;;) {
+        if (v < 10) { break; }
+        if (v < 100) { num_digits += 1; break; }
+        if (v < 1000) { num_digits +=2; break; }
+        if (v < 10000) { num_digits += 3; break; }
+        v /= UINT32_C(10000);
+        num_digits += 4;
+    }
+
+    if (U < 10) {
+        num_digits = 1;
+    } else if (U < 100) {
+        num_digits = 2;
+    } else if (U < 1000) {
+        num_digits = 3;
+    } else if (U < 10000) {
+        num_digits = 4;
+    } else if (U < 100000) {
+        num_digits = 5;
+    } else if (U < 1000000) {
+        num_digits = 6;
+    } else if (U < 10000000) {
+        num_digits = 7;
+    } else if (U < 100000000) {
+        num_digits = 8;
+    } else if (U < 1000000000) {
+        num_digits = 9;
+    } else {
+        num_digits = 10;
+    }
+
+    MR_allocate_aligned_string_msg(S, num_digits, MR_ALLOC_ID);
+    S[num_digits] = '\\0';
+    int i = num_digits - 1;
+    do {
+        S[i] = \"0123456789\"[U % 10];
+        i--;
+    } while ( U /= 10 );
+").
+
+%---------------------------------------------------------------------------%
+
+% Use addition to compute digit chars instead of an array lookup.
+
+:- func alt3_uint32_to_string(uint32::in) = (string::uo) is det.
+:- pragma foreign_proc("C",
+    alt3_uint32_to_string(U::in) = (S::uo),
+    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail],
+"
+    int num_digits;
+    if (U < 10) {
+        num_digits = 1;
+    } else if (U < 100) {
+        num_digits = 2;
+    } else if (U < 1000) {
+        num_digits = 3;
+    } else if (U < 10000) {
+        num_digits = 4;
+    } else if (U < 100000) {
+        num_digits = 5;
+    } else if (U < 1000000) {
+        num_digits = 6;
+    } else if (U < 10000000) {
+        num_digits = 7;
+    } else if (U < 100000000) {
+        num_digits = 8;
+    } else if (U < 1000000000) {
+        num_digits = 9;
+    } else {
+        num_digits = 10;
+    }
+
+    MR_allocate_aligned_string_msg(S, num_digits, MR_ALLOC_ID);
+    S[num_digits] = '\\0';
+    int i = num_digits - 1;
+    do {
+        S[i] = '0' + (U % 10);
+        i--;
+    } while ( U /= 10 );
+").
+
+%---------------------------------------------------------------------------%
+
+% XXX TODO: try table driven approaches.
+
+%---------------------------------------------------------------------------%
+:- end_module uint32_conversion.
+%---------------------------------------------------------------------------%
diff --git a/library/string.m b/library/string.m
index edee14b..3f4d85d 100644
--- a/library/string.m
+++ b/library/string.m
@@ -5977,10 +5977,36 @@ uint_to_hex_string(UInt) =
      uint32_to_string(U32::in) = (S::uo),
      [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail],
  "
-    char buffer[11]; // 10 for digits, 1 for nul.
-    sprintf(buffer, ""%"" PRIu32, U32);
-    MR_allocate_aligned_string_msg(S, strlen(buffer), MR_ALLOC_ID);
-    strcpy(S, buffer);
+    int num_digits;
+    if (U32 < 10) {
+        num_digits = 1;
+    } else if (U32 < 100) {
+        num_digits = 2;
+    } else if (U32 < 1000) {
+        num_digits = 3;
+    } else if (U32 < 10000) {
+        num_digits = 4;
+    } else if (U32 < 100000) {
+        num_digits = 5;
+    } else if (U32 < 1000000) {
+        num_digits = 6;
+    } else if (U32 < 10000000) {
+        num_digits = 7;
+    } else if (U32 < 100000000) {
+        num_digits = 8;
+    } else if (U32 < 1000000000) {
+        num_digits = 9;
+    } else {
+        num_digits = 10;
+    }
+
+    MR_allocate_aligned_string_msg(S, num_digits, MR_ALLOC_ID);
+    S[num_digits] = '\\0';
+    int i = num_digits - 1;
+    do {
+        S[i] = '0' + (U32 % 10);
+        i--;
+    } while ( U32 /= 10 );
  ").

  :- pragma foreign_proc("C#",
diff --git a/tests/hard_coded/Mmakefile b/tests/hard_coded/Mmakefile
index 7e35a1d..4b105e2 100644
--- a/tests/hard_coded/Mmakefile
+++ b/tests/hard_coded/Mmakefile
@@ -460,6 +460,7 @@ ORDINARY_PROGS = \
  	uint16_switch_test \
  	uint32_from_bytes \
  	uint32_switch_test \
+	uint32_to_string \
  	uint32_to_uint64 \
  	uint64_from_bytes \
  	uint64_ground_term \
diff --git a/tests/hard_coded/uint32_to_string.exp b/tests/hard_coded/uint32_to_string.exp
new file mode 100644
index 0000000..1ade810
--- /dev/null
+++ b/tests/hard_coded/uint32_to_string.exp
@@ -0,0 +1,46 @@
+0
+1
+2
+7
+8
+9
+10
+11
+99
+100
+101
+127
+128
+129
+254
+255
+256
+999
+1000
+1001
+9999
+10000
+10001
+32766
+32767
+32768
+65534
+65535
+65536
+99999
+100000
+100001
+999999
+1000000
+1000001
+9999999
+10000000
+10000001
+99999999
+100000000
+100000001
+999999999
+1000000000
+1000000001
+2147483647
+4294967295
diff --git a/tests/hard_coded/uint32_to_string.m b/tests/hard_coded/uint32_to_string.m
new file mode 100644
index 0000000..ee8df01
--- /dev/null
+++ b/tests/hard_coded/uint32_to_string.m
@@ -0,0 +1,87 @@
+%---------------------------------------------------------------------------%
+% vim: ft=mercury ts=4 sw=4 et
+%---------------------------------------------------------------------------%
+%
+% A test of uint32 to string conversion.
+%
+%---------------------------------------------------------------------------%
+
+:- module uint32_to_string.
+:- interface.
+
+:- import_module io.
+
+:- pred main(io::di, io::uo) is det.
+
+%---------------------------------------------------------------------------%
+%---------------------------------------------------------------------------%
+
+:- implementation.
+
+:- import_module int32.
+:- import_module list.
+:- import_module string.
+:- import_module uint32.
+
+%---------------------------------------------------------------------------%
+
+main(!IO) :-
+    P = (pred(U::in, !.IO::di, !:IO::uo) is det :-
+        io.print_line(uint32_to_string(U), !IO)
+    ),
+    list.foldl(P, test_numbers, !IO).
+
+:- func test_numbers = list(uint32).
+
+test_numbers = [
+    0u32,
+    1u32,
+    2u32,
+    7u32,
+    8u32,
+    9u32,
+    10u32,
+    11u32,
+    99u32,
+    100u32,
+    101u32,
+    127u32,
+    128u32,
+    129u32,
+    254u32,
+    255u32,
+    256u32,
+    999u32,
+    1000u32,
+    1001u32,
+    9999u32,
+    10000u32,
+    10001u32,
+    32766u32,
+    32767u32,
+    32768u32,
+    65534u32,
+    65535u32,
+    65536u32,
+    99999u32,
+    100000u32,
+    100001u32,
+    999999u32,
+    1000000u32,
+    1000001u32,
+    9999999u32,
+    10000000u32,
+    10000001u32,
+    99999999u32,
+    100000000u32,
+    100000001u32,
+    999999999u32,
+    1000000000u32,
+    1000000001u32,
+    uint32.cast_from_int32(int32.max_int32),
+    uint32.max_uint32
+].
+
+%---------------------------------------------------------------------------%
+:- end_module uint32_to_string.
+%---------------------------------------------------------------------------%


More information about the reviews mailing list