[m-rev.] For review: Loop control runtime support.

Paul Bone pbone at csse.unimelb.edu.au
Fri Sep 9 16:04:46 AEST 2011


For review by Zoltan.

Note that there are many XXXs marking opportunities for optimization, I expect
to revisit these and perhaps even the design (we discussed a different idea for
what to do when no worker is available some weeks ago).

---

Introduce loop control runtime support.

runtime/mercury_par_builtin.h:
runtime/mercury_par_builtin.c:
    Introduce loop control runtime code.

runtime/mercury_context.h:
    Introduce a new new macro to tune the size of contexts that are used as
    workers by the loop control runtime.  This is set to the same context size
    as for sparks.

runtime/mercury_context.c:
    Fixed a typeo in a comment.

library/par_builtin.m:
    Create predicate versions of the par builtin macros runtime code.  The only
    primitive without a predicate version is MR_lc_spawn_off which cannot be
    expressed in Mercury and needs support from the LLDS stage in the compiler.

mdbcomp/program_representation.m:
    Add par_builtin.lc_finish/1 as an externally defined predicate.  This tells
    the debugger not to expect any events for it.

Index: library/par_builtin.m
===================================================================
RCS file: /home/mercury1/repository/mercury/library/par_builtin.m,v
retrieving revision 1.26
diff -u -p -b -r1.26 par_builtin.m
--- library/par_builtin.m	31 May 2011 07:06:14 -0000	1.26
+++ library/par_builtin.m	9 Sep 2011 05:57:00 -0000
@@ -69,6 +69,35 @@
 :- impure pred signal_future(future(T)::in, T::in) is det.
 
 %---------------------------------------------------------------------------%
+%
+% These types and predicates are used by the loop control transformation.
+%
+
+:- type loop_control.
+
+:- type loop_control_slot.
+
+    % Create a loop control structure, see MR_lc_create in mercury_par_builtin.h
+    %
+:- pred lc_create(int::in, loop_control::out) is det.
+
+    % Finish a loop and finalize a loop control structure, see MR_lc_finish in
+    % mercury_par_builtin.h
+
+:- impure pred lc_finish(loop_control::in) is det.
+
+    % Allocate a free slot from the loop control structure and return it.
+    % See MR_lc_try_get_free_slot in mercury_par_builtin.h
+    %
+:- impure pred lc_free_slot(loop_control::in, loop_control_slot::out) is semidet.
+
+    % Finish one iteration of the loop.  This call does not return.
+    % See MR_lc_join_and_terminate in mercury_par_builtin.h
+    %
+:- impure pred lc_join_and_terminate(loop_control::in, loop_control_slot::in)
+    is det.
+
+%-----------------------------------------------------------------------------%
 
 % The following predicates are intended to be used as conditions to decide
 % whether the conjuncts of a conjunction should be executed in parallel or
@@ -237,6 +266,167 @@ INIT mercury_sys_init_par_builtin_module
 
 %-----------------------------------------------------------------------------%
 
+:- pragma foreign_type("C", loop_control, "MR_LoopControl *",
+    [can_pass_as_mercury_type]).
+
+    % Placeholders only.
+:- pragma foreign_type(il, loop_control, "class [mscorlib]System.Object").
+:- pragma foreign_type("Erlang", loop_control, "").
+:- pragma foreign_type("C#", loop_control, "object").
+:- pragma foreign_type("Java", loop_control, "java.lang.Object").
+
+:- pragma foreign_type("C", loop_control_slot, "MR_LoopControlSlot *",
+    [can_pass_as_mercury_type]).
+
+    % Placeholders only.
+:- pragma foreign_type(il, loop_control_slot, "class [mscorlib]System.Object").
+:- pragma foreign_type("Erlang", loop_control_slot, "").
+:- pragma foreign_type("C#", loop_control_slot, "object").
+:- pragma foreign_type("Java", loop_control_slot, "java.lang.Object").
+
+:- pragma foreign_proc("C",
+
+    lc_create(NumWorkers::in, LC::out),
+    [will_not_call_mercury, will_not_throw_exception, thread_safe,
+     promise_pure],
+"
+#if defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ)
+    LC = MR_lc_create(NumWorkers);
+#else
+    MR_fatal_error(""lc_create is unavailable in this grade"");
+#endif
+").
+
+
+%
+% IMPORTANT: any changes or additions to external predicates should be
+% reflected in the definition of pred_is_external in
+% mdbcomp/program_representation.m.  The debugger needs to know what predicates
+% are defined externally, so that it knows not to expect events for those
+% predicates.
+%
+:- external(lc_finish/1).
+
+:- pragma foreign_code("C",
+"
+
+void mercury_sys_init_lc_init(void);
+void mercury_sys_init_lc_init_type_tables(void);
+#ifdef  MR_DEEP_PROFILING
+void mercury_sys_init_lc_write_out_proc_statics(FILE *deep_fp,
+    FILE *procrep_fp);
+#endif
+
+#ifndef MR_HIGHLEVEL_CODE
+
+MR_def_extern_entry(par_builtin__lc_finish_1_0)
+
+MR_decl_label1(par_builtin__lc_finish_1_0, 1)
+
+MR_BEGIN_MODULE(par_builtin_module_lc_finish)
+    MR_init_entry1(par_builtin__lc_finish_1_0);
+    MR_INIT_PROC_LAYOUT_ADDR(mercury__par_builtin__lc_finish_1_0);
+    MR_init_label1(par_builtin__lc_finish_1_0,1);
+MR_BEGIN_CODE
+
+#ifdef MR_maybe_local_thread_engine_base
+    #undef MR_maybe_local_thread_engine_base
+    #define MR_maybe_local_thread_engine_base MR_local_thread_engine_base
+#endif
+
+MR_define_entry(mercury__par_builtin__lc_finish_1_0)
+    MR_MAYBE_INIT_LOCAL_THREAD_ENGINE_BASE
+
+    MR_incr_sp(1);
+    MR_sv(1) = MR_r1; /* LC */
+
+#if defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ)
+    {
+        MR_LoopControl* LC = (MR_LoopControl*)MR_r1;
+        MR_lc_finish_part1(LC, par_builtin__lc_finish_1_0_i1);
+    }
+#else
+    MR_fatal_error(""lc_finish is unavailable in this grade"");
+#endif
+
+MR_def_label(par_builtin__lc_finish_1_0,1)
+    MR_MAYBE_INIT_LOCAL_THREAD_ENGINE_BASE
+
+#if defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ)
+    {
+        MR_LoopControl* LC = (MR_LoopControl*)MR_sv(1);
+        MR_lc_finish_part2(LC);
+    }
+#endif
+
+    MR_decr_sp(1);
+    MR_proceed();
+
+#ifdef MR_maybe_local_thread_engine_base
+    #undef MR_maybe_local_thread_engine_base
+    #define MR_maybe_local_thread_engine_base MR_thread_engine_base
+#endif
+MR_END_MODULE
+
+#endif /* ! MR_HIGHLEVEL_CODE */
+
+/*
+** Module initialization
+*/
+/*
+INIT mercury_sys_init_lc
+*/
+
+void
+mercury_sys_init_lc_init(void)
+{
+#ifndef MR_HIGHLEVEL_CODE
+    par_builtin_module_lc_finish();
+#endif
+}
+
+void
+mercury_sys_init_lc_init_type_tables(void)
+{
+    /* no types to register */
+}
+
+#ifdef  MR_DEEP_PROFILING
+void
+mercury_sys_init_lc_write_out_proc_statics(FILE *deep_fp,
+    FILE *procrep_fp)
+{
+    /* The deep profiler shouldn't notice loop control predicates. */
+}
+#endif
+
+").
+
+:- pragma foreign_proc("C",
+    lc_free_slot(LC::in, LCS::out),
+    [will_not_call_mercury, will_not_throw_exception, thread_safe],
+"
+#if defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ)
+    LCS = MR_lc_try_get_free_slot(LC);
+    SUCCESS_INDICATOR = (LCS != NULL);
+#else
+    MR_fatal_error(""lc_free_slot is unavailable in this grade"");
+#endif
+").
+
+:- pragma foreign_proc("C",
+    lc_join_and_terminate(LC::in, LCS::in),
+    [will_not_call_mercury, will_not_throw_exception, thread_safe],
+"
+#if defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ)
+    MR_lc_join_and_terminate(LC, LCS);
+#else
+    MR_fatal_error(""lc_join_and_terminate is unavailable in this grade"");
+#endif
+").
+
+%-----------------------------------------------------------------------------%
+
 :- pragma foreign_proc("C",
     num_os_threads(NThreads::out),
     [will_not_call_mercury, will_not_throw_exception, thread_safe,
Index: mdbcomp/program_representation.m
===================================================================
RCS file: /home/mercury1/repository/mercury/mdbcomp/program_representation.m,v
retrieving revision 1.62
diff -u -p -b -r1.62 program_representation.m
--- mdbcomp/program_representation.m	31 May 2011 03:14:21 -0000	1.62
+++ mdbcomp/program_representation.m	9 Sep 2011 05:57:00 -0000
@@ -1578,6 +1578,7 @@ pred_is_external("builtin", "compare", 4
 pred_is_external("builtin", "compare_representation", 4).
 pred_is_external("backjump", "builtin_choice_id", 1).
 pred_is_external("backjump", "builtin_backjump", 1).
+pred_is_external("par_builtin", "lc_finish", 1).
 
 %-----------------------------------------------------------------------------%
 
Index: runtime/mercury_context.c
===================================================================
RCS file: /home/mercury1/repository/mercury/runtime/mercury_context.c,v
retrieving revision 1.97
diff -u -p -b -r1.97 mercury_context.c
--- runtime/mercury_context.c	8 Sep 2011 01:53:08 -0000	1.97
+++ runtime/mercury_context.c	9 Sep 2011 05:45:58 -0000
@@ -1253,7 +1253,7 @@ MR_schedule_context(MR_Context *ctxt)
                 &wake_action_data, NULL))
             {
                 /*
-                ** THe context has been given to a engine.
+                ** The context has been given to a engine.
                 */
                 return;
             }
Index: runtime/mercury_context.h
===================================================================
RCS file: /home/mercury1/repository/mercury/runtime/mercury_context.h,v
retrieving revision 1.68
diff -u -p -b -r1.68 mercury_context.h
--- runtime/mercury_context.h	24 May 2011 04:16:48 -0000	1.68
+++ runtime/mercury_context.h	9 Sep 2011 05:45:58 -0000
@@ -219,8 +219,10 @@ typedef enum {
 
 #ifdef MR_STACK_SEGMENTS
 #define MR_CONTEXT_SIZE_FOR_SPARK MR_CONTEXT_SIZE_REGULAR
+#define MR_CONTEXT_SIZE_FOR_LOOP_CONTROL_WORKER MR_CONTEXT_SIZE_REGULAR
 #else
 #define MR_CONTEXT_SIZE_FOR_SPARK MR_CONTEXT_SIZE_SMALL
+#define MR_CONTEXT_SIZE_FOR_LOOP_CONTROL_WORKER MR_CONTEXT_SIZE_SMALL
 #endif
 
 #ifdef MR_THREAD_SAFE
Index: runtime/mercury_par_builtin.c
===================================================================
RCS file: /home/mercury1/repository/mercury/runtime/mercury_par_builtin.c,v
retrieving revision 1.1
diff -u -p -b -r1.1 mercury_par_builtin.c
--- runtime/mercury_par_builtin.c	11 Jun 2009 08:28:32 -0000	1.1
+++ runtime/mercury_par_builtin.c	9 Sep 2011 05:56:55 -0000
@@ -10,6 +10,12 @@ vim: ft=c ts=4 sw=4 et
 #include "mercury_types.h"
 #include "mercury_par_builtin.h"
 
+/***************************************************************************
+**
+** Futures for decedent AND-parallelism..
+**
+***************************************************************************/
+
 #ifdef MR_CONSERVATIVE_GC
 
 void
@@ -25,3 +31,120 @@ MR_finalize_future(void *obj, void *cd)
 }
 
 #endif
+
+/***************************************************************************
+**
+** Code for Loop controls..
+**
+***************************************************************************/
+
+#if defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ)
+
+MR_LoopControl* MR_lc_create(unsigned num_workers)
+{
+    MR_LoopControl* lc;
+    unsigned        i;
+
+    lc = MR_GC_malloc(sizeof(MR_LoopControl));
+    lc->MR_lc_slots = MR_GC_malloc(sizeof(MR_LoopControlSlot) * num_workers);
+    for (i = 0; i < num_workers; i++)
+    {
+        /*
+        ** We allocate contexts as necessary, so that we never allocate a
+        ** context we don't use.  Also by delaying the allocation of contexts
+        ** all but the first may execute in parallel with one-another.
+        */
+        lc->MR_lc_slots[i].MR_lcs_context = NULL;
+        lc->MR_lc_slots[i].MR_lcs_is_free = MR_TRUE;
+    }
+    lc->MR_lc_num_slots = num_workers;
+    lc->MR_lc_outstanding_workers = 0;
+    lc->MR_lc_waiting_context = NULL;
+    pthread_mutex_init(&(lc->MR_lc_lock), MR_MUTEX_ATTR);
+
+    return lc;
+}
+
+MR_LoopControlSlot* MR_lc_try_get_free_slot(MR_LoopControl* lc)
+{
+    if (lc->MR_lc_outstanding_workers == lc->MR_lc_num_slots) {
+        return NULL;
+    } else {
+        unsigned i;
+
+        /*
+        ** Optimize this by using a hint to start the search at.
+        */
+        for (i = 0; i<lc->MR_lc_num_slots; i++) {
+            if (lc->MR_lc_slots[i].MR_lcs_is_free == MR_TRUE) {
+                lc->MR_lc_slots[i].MR_lcs_is_free = MR_FALSE;
+                MR_atomic_inc_int(&(lc->MR_lc_outstanding_workers));
+                return &(lc->MR_lc_slots[i]);
+            }
+        }
+
+        return NULL;
+    }
+}
+
+void MR_lc_spawn_off_(MR_LoopControlSlot* lcs, MR_Code* code_ptr)
+{
+    if (lcs->MR_lcs_context == NULL) {
+        /*
+        ** We need a new context.
+        */
+        lcs->MR_lcs_context = MR_create_context("Loop control",
+            MR_CONTEXT_SIZE_FOR_LOOP_CONTROL_WORKER, NULL);
+        lcs->MR_lcs_context->MR_ctxt_thread_local_mutables = MR_THREAD_LOCAL_MUTABLES;
+    }
+
+    lcs->MR_lcs_context->MR_ctxt_resume = code_ptr;
+    lcs->MR_lcs_context->MR_ctxt_parent_sp = MR_sp;
+    MR_schedule_context(lcs->MR_lcs_context);
+}
+
+void MR_lc_join(MR_LoopControl* lc, MR_LoopControlSlot* lcs)
+{
+    MR_bool     last_worker;
+    MR_Context  *wakeup_context;
+
+    lcs->MR_lcs_is_free = MR_TRUE;
+    last_worker = MR_atomic_dec_and_is_zero_int(&(lc->MR_lc_outstanding_workers));
+    /*
+    ** This barrier ensures we update MR_lc_outstanding_contexts before we read
+    ** MR_lc_finished, it works with another barrier in MR_lc_finish().
+    ** Together these barriers prevent a race where by the original thread is
+    ** not resumed because MR_lc_finished looked false in the condition below
+    ** but last_worker was true and the original thread is about to go to
+    ** sleep.
+    **
+    ** We go through these checks to avoid taking the lock in the then branch
+    ** below in cases when MR_lc_outstanding_workers is zero but the original
+    ** thread has not called MR_lc_finish() yet.
+    */
+    MR_CPU_MFENCE;
+    if (last_worker && lc->MR_lc_finished) {
+        /*
+        ** Wake up the first thread if it is sleeping.
+        ** XXX: a spinlock would do here, or maybe a CAS, we never hold the lock for long.
+        */
+        MR_LOCK(&(lc->MR_lc_lock), "MC_lc_join_and_terminate");
+        wakeup_context = lc->MR_lc_waiting_context;
+        /*
+        ** We don't need to clear the context field at this point, only one
+        ** worker can ever be the last worker and therefore there's no danger
+        ** in adding this context to the run queue twice.
+        */
+        MR_UNLOCK(&(lc->MR_lc_lock), "MR_lc_join_and_terminate");
+        if (wakeup_context != NULL) {
+            /*
+            ** XXX: it's faster to switch to this context ourselves since we're
+            ** going to unload our own context.
+            */
+            MR_schedule_context(wakeup_context);
+        }
+    }
+}
+
+#endif /* MR_THREAD_SAFE && MR_LL_PARALLEL_CONJ */
+
Index: runtime/mercury_par_builtin.h
===================================================================
RCS file: /home/mercury1/repository/mercury/runtime/mercury_par_builtin.h,v
retrieving revision 1.6
diff -u -p -b -r1.6 mercury_par_builtin.h
--- runtime/mercury_par_builtin.h	31 May 2011 03:14:21 -0000	1.6
+++ runtime/mercury_par_builtin.h	9 Sep 2011 05:56:55 -0000
@@ -9,10 +9,10 @@ vim: ft=c ts=4 sw=4 et
 
 /*
 ** This module contains the definitions of the primitive operations we use to
-** implement dependent AND-parallelism as C macros. The macros can be invoked
-** either from the predicates representing the operations in par_builtin.m
-** in the library directory, or from a foreign_proc version of those
-** predicates inserted directly into compiler-generated code.
+** implement builtins for parallel grades as C macros. Some of these macros can
+** be invoked either from the predicates representing the operations in
+** par_builtin.m in the library directory, or from a foreign_proc version of
+** those predicates inserted directly into compiler-generated code.
 */
 
 #ifndef MERCURY_PAR_BUILTIN_H
@@ -23,6 +23,12 @@ vim: ft=c ts=4 sw=4 et
 #include "mercury_threadscope.h"
 #include "mercury_atomic_ops.h"
 
+/***************************************************************************
+**
+** Futures for dependant AND parallelism.
+**
+***************************************************************************/
+
 #ifdef MR_THREAD_SAFE
 
     struct MR_Future_Struct {
@@ -278,4 +284,147 @@ vim: ft=c ts=4 sw=4 et
 
 #endif
 
+/***************************************************************************
+**
+** Builtins for loop coordination.
+**
+***************************************************************************/
+
+#if defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ)
+
+typedef struct MR_LoopControl_Struct        MR_LoopControl;
+
+typedef struct MR_LoopControlSlot_Struct    MR_LoopControlSlot;
+
+struct MR_LoopControl_Struct
+{
+    MR_LoopControlSlot*                 MR_lc_slots;
+    unsigned                            MR_lc_num_slots;
+    MR_THREADSAFE_VOLATILE MR_Integer   MR_lc_outstanding_workers;
+    MR_Context*                         MR_lc_waiting_context;
+    MR_THREADSAFE_VOLATILE MR_bool      MR_lc_finished;
+    MercuryLock                         MR_lc_lock;
+};
+
+struct MR_LoopControlSlot_Struct
+{
+    MR_Context*         MR_lcs_context;
+    MR_bool             MR_lcs_is_free;
+};
+
+#else
+
+/*
+** We have to define these types so that par_builtin.m can use them as foreign
+** types, even in grades that don't support them.
+*/
+
+typedef MR_Word MR_LoopControl;
+typedef MR_Word MR_LoopControlSlot;
+
+#endif
+
+#if defined(MR_THREAD_SAFE) && defined(MR_LL_PARALLEL_CONJ)
+
+/*
+** XXX: Make these functions macros, they're functions now to make debugging
+** and testing easier.
+*/
+
+/*
+** Create and initialize a loop control structure.
+*/
+extern MR_LoopControl* MR_lc_create(unsigned num_workers);
+
+/*
+** Wait for all workers and then finalize and free the loop control structure,
+** The caller must pass a module-unique (unquoted) string for resume_point_name
+** that will be used in the name for a C label.
+*/
+#define MR_lc_finish_part1(lc, label)                                       \
+    do {                                                                    \
+        (lc)->MR_lc_finished = MR_TRUE;                                     \
+        /*                                                                  \
+        ** This barrier ensures that MR_lc_finished before we read          \
+        ** MR_lc_outstanding_contexts, it works with another barrier in     \
+        ** MR_lc_join_and_terminate().  See MR_lc_join_and_terminate().     \
+        */                                                                  \
+        MR_CPU_MFENCE;                                                      \
+        if ((lc)->MR_lc_outstanding_workers > 0) {                          \
+            /*                                                              \
+            ** This context must wait until the workers are finished.       \
+            ** This must be implemented as a macro, we cannot move the C    \
+            ** stack pointer without extra work.                            \
+            */                                                              \
+            MR_ENGINE(MR_eng_this_context)->MR_ctxt_resume =                \
+                MR_LABEL(MR_add_prefix(label));                             \
+            MR_LOCK(&((lc)->MR_lc_lock), "MR_lc_finish_part1");             \
+            if ((lc)->MR_lc_outstanding_workers == 0) {                     \
+                MR_UNLOCK(&((lc)->MR_lc_lock), "MR_lc_finish_part1");       \
+                MR_GOTO_LOCAL(MR_add_prefix(label));                        \
+            }                                                               \
+            MR_save_context(MR_ENGINE(MR_eng_this_context));                \
+            (lc)->MR_lc_waiting_context = MR_ENGINE(MR_eng_this_context);   \
+            MR_UNLOCK(&((lc)->MR_lc_lock), "MR_lc_finish_part1");           \
+            MR_ENGINE(MR_eng_this_context) = NULL;                          \
+            MR_idle(); /* Release the engine to the idle loop */            \
+        }                                                                   \
+    } while (0);
+
+#define MR_lc_finish_part2(lc)                                              \
+    do {                                                                    \
+        unsigned i;                                                         \
+                                                                            \
+        /*                                                                  \
+        ** All the jobs have finished,                                      \
+        */                                                                  \
+        for (i = 0; i < (lc)->MR_lc_num_slots; i++) {                       \
+            if ((lc)->MR_lc_slots[i].MR_lcs_context != NULL) {              \
+                MR_destroy_context((lc)->MR_lc_slots[i].MR_lcs_context);    \
+            }                                                               \
+        }                                                                   \
+        pthread_mutex_destroy(&((lc)->MR_lc_lock));                         \
+    } while (0);
+
+/*
+** Get a free slot in the loop control if there is one.
+*/
+extern MR_LoopControlSlot* MR_lc_try_get_free_slot(MR_LoopControl* lc);
+
+/*
+** Try to spawn off this code using the free slot.
+*/
+#define MR_lc_spawn_off(lcs, label) \
+    MR_lc_spawn_off_((lcs), MR_LABEL(MR_add_prefix(label)))
+
+extern void MR_lc_spawn_off_(MR_LoopControlSlot* lcs, MR_Code* code_ptr);
+
+/*
+** Join and termiante a worker.
+*/
+#define MR_lc_join_and_terminate(lc, lcs)                                   \
+    do {                                                                    \
+        MR_lc_join((lc), (lcs));                                            \
+                                                                            \
+        /*                                                                  \
+        ** Termination of this context must be handled in a macro so that   \
+        ** C's stack pointer is set correctly.  It might appear that we     \
+        ** don't need to save the context since it doesn't hold a           \
+        ** computation, but we do so that we save bookkeeping information.  \
+        ** A similar mistake was the cause of a hard-to-diagnose bug in     \
+        ** parallel stack segments grades.                                  \
+        */                                                                  \
+        MR_save_context(MR_ENGINE(MR_eng_this_context));                    \
+        MR_ENGINE(MR_eng_this_context) = NULL;                              \
+        MR_idle();                                                          \
+    } while (0);
+
+/*
+** Join a worker context with the main thread.  Termination of the context
+** is handled in the macro above.
+*/
+extern void MR_lc_join(MR_LoopControl* lc, MR_LoopControlSlot* lcs);
+
+#endif /* MR_THREAD_SAFE && MR_LL_PARALLEL_CONJ */
+
 #endif /* not MERCURY_PAR_BUILTIN_H */
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 489 bytes
Desc: Digital signature
URL: <http://lists.mercurylang.org/archives/reviews/attachments/20110909/73cdb1ce/attachment.sig>


More information about the reviews mailing list