[m-rev.] For review: Parallel runtime profiling improvements.

Paul Bone pbone at csse.unimelb.edu.au
Sun Aug 16 20:59:16 AEST 2009


For post-commit review.  I have already committed this; after committing I
realized that some files had over-long lines, and I have since committed a
fix for them.  The diff below is the final version, without the over-long
lines.

Parallel runtime profiling improvements.

Improve the profiling of the parallel runtime code in two main ways:
	+ Record data for more events.
	+ Record high-precision timing data on x86 machines via the TSC where
	  access to the TSC is available.

The TSC can be read via two machine instructions: RDTSC (read TSC) and
RDTSCP (read TSC and processor ID).  We prefer the latter because a process
migrated between two RDTSC calls may produce an incorrect duration (TSC
counts are seldom synchronized between processors).  We fall back to RDTSC
when RDTSCP is not available, and gracefully record no timing information
when neither is available.  Availability is detected via the CPUID
instruction; see MR_configure_profiling_timers().
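
To give reviewers an idea of how the pieces fit together, here is a minimal
usage sketch (not part of the diff).  The event being timed and its stats
variable are hypothetical; the types and functions are the ones added to
mercury_atomic_ops.h below, and MR_configure_profiling_timers() is called
once at startup from mercury_runtime_init().

    #if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)

    /* Hypothetical event statistics, zero-initialised like those in
    ** mercury_context.c.
    */
    static MR_Stats my_event_stats = { 0, 0, 0, 0 };

    static void
    time_my_event(void)
    {
        MR_Timer timer;

        /* Record the TSC (and the processor ID when RDTSCP is available). */
        MR_profiling_start_timer(&timer);

        /* ... the event being timed ... */

        /*
        ** Compute the elapsed TSC count and atomically accumulate it into the
        ** stats.  With RDTSCP, a sample taken across a processor migration is
        ** counted but its duration is not recorded.
        */
        MR_profiling_stop_timer(&timer, &my_event_stats);
    }

    #endif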

runtime/mercury_context.c:
runtime/mercury_context.h:
	Runtime profiling changes as above.

runtime/mercury_atomic_ops.c:
runtime/mercury_atomic_ops.h:
	Add runtime profiling timing code.
	Add new add and subtract atomic operations (a short usage sketch
	follows at the end of this log).

runtime/mercury_wrapper.c:
	Call the new MR_configure_profiling_timers() procedure to detect the CPU
	and configure access to the TSC.

Mmakefile:
runtime/Mmakefile:
	'mmake tags' at the top level now builds the tags file for the runtime
	directory.
	The tags target in the runtime directory is now marked as .PHONY, so the
	tags file is regenerated even if it already exists.
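
The new atomic add and subtract operations have the same shape as the
existing increment and decrement operations.  A minimal sketch of their use
follows; the counter is hypothetical, the operations are those added to
mercury_atomic_ops.h (lock; add/sub on i386 and amd64, with a fallback to the
GCC __sync builtins).

    #if defined(MR_LL_PARALLEL_CONJ)

    /* A hypothetical shared counter. */
    static volatile MR_Integer outstanding_work = 0;

    static void
    note_new_work(MR_Integer num_jobs)
    {
        /* Atomically: outstanding_work += num_jobs. */
        MR_atomic_add_int(&outstanding_work, num_jobs);
    }

    static void
    note_finished_work(MR_Integer num_jobs)
    {
        /* Atomically: outstanding_work -= num_jobs. */
        MR_atomic_sub_int(&outstanding_work, num_jobs);
    }

    #endif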

Thanks.

Index: Mmakefile
===================================================================
RCS file: /home/mercury1/repository/mercury/Mmakefile,v
retrieving revision 1.135
diff -u -p -b -r1.135 Mmakefile
--- Mmakefile	29 Jul 2009 03:03:59 -0000	1.135
+++ Mmakefile	11 Aug 2009 23:52:34 -0000
@@ -296,7 +296,8 @@ tags: 	tags_library \
 	tags_compiler \
 	tags_slice \
 	tags_profiler \
-	tags_deep_profiler
+	tags_deep_profiler \
+	tags_runtime
 
 .PHONY: tags_compiler
 tags_compiler:
@@ -330,6 +331,10 @@ tags_profiler:
 tags_deep_profiler:
 	+cd deep_profiler && $(SUBDIR_MMAKE) tags
 
+.PHONY: tags_runtime
+tags_runtime:
+	+cd runtime && $(SUBDIR_MMAKE) tags
+
 #-----------------------------------------------------------------------------#
 
 # Remove from each of the listed directories mmc-generated files that don't
Index: runtime/Mmakefile
===================================================================
RCS file: /home/mercury1/repository/mercury/runtime/Mmakefile,v
retrieving revision 1.146
diff -u -p -b -r1.146 Mmakefile
--- runtime/Mmakefile	11 Jun 2009 08:28:31 -0000	1.146
+++ runtime/Mmakefile	11 Aug 2009 23:58:19 -0000
@@ -431,6 +431,7 @@ mercury_conf.h: mercury_conf.h.date
 .PHONY: cs
 cs:	$(CFILES)
 
+.PHONY: tags
 tags:	$(CFILES) $(HDRS) $(BODY_HDRS) $(LIB_DLL_H)
 	ctags $(CFILES) $(HDRS) $(BODY_HDRS) $(LIB_DLL_H)
 
Index: runtime/mercury_atomic_ops.c
===================================================================
RCS file: /home/mercury1/repository/mercury/runtime/mercury_atomic_ops.c,v
retrieving revision 1.3
diff -u -p -b -r1.3 mercury_atomic_ops.c
--- runtime/mercury_atomic_ops.c	17 Jun 2009 03:26:00 -0000	1.3
+++ runtime/mercury_atomic_ops.c	16 Aug 2009 06:00:40 -0000
@@ -36,7 +36,7 @@ MR_OUTLINE_DEFN(
     MR_atomic_inc_int(volatile MR_Integer *addr)
 ,
     {
-        MR_ATOMIC_INC_WORD_BODY;
+        MR_ATOMIC_INC_INT_BODY;
     }
 )
 
@@ -45,8 +45,310 @@ MR_OUTLINE_DEFN(
     MR_atomic_dec_int(volatile MR_Integer *addr)
 ,
     {
-        MR_ATOMIC_DEC_WORD_BODY;
+        MR_ATOMIC_DEC_INT_BODY;
+    }
+)
+
+MR_OUTLINE_DEFN(
+    void
+    MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
+,
+    {
+        MR_ATOMIC_ADD_INT_BODY;
+    }
+)
+
+MR_OUTLINE_DEFN(
+    void
+    MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
+,
+    {
+        MR_ATOMIC_SUB_INT_BODY;
     }
 )
 
 #endif /* MR_LL_PARALLEL_CONJ */
+
+#if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))
+static MR_bool  MR_rdtscp_is_available = MR_FALSE;
+static MR_bool  MR_rdtsc_is_available = MR_FALSE;
+#endif
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))
+
+/* Set this to 1 to enable some printfs below */
+#define MR_DEBUG_CPU_FEATURE_DETECTION 0 
+
+/*
+** cpuid, rdtscp and rdtsc are i386/amd64 instructions.
+*/
+static __inline__ void
+MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
+    MR_Unsigned *a, MR_Unsigned *b, MR_Unsigned *c, MR_Unsigned *d);
+
+static __inline__ void
+MR_rdtscp(MR_uint_least64_t *tsc, MR_Unsigned *processor_id);
+
+static __inline__ void
+MR_rdtsc(MR_uint_least64_t *tsc);
+
+#endif
+
+extern void 
+MR_configure_profiling_timers(void) {
+#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))
+    MR_Unsigned     a, b, c, d;
+    MR_Unsigned     eflags, old_eflags;
+
+    /* 
+    ** Check for the CPUID instruction.  CPUID is supported if we can flip bit
+    ** 21 in the CPU's EFLAGS register.  The assembly below is written in a
+    ** subset of i386 and amd64 assembly.  To read and write EFLAGS we have
+    ** to go via the C stack.
+    */
+    __asm__ ("pushf; pop %0"
+            :"=r"(eflags));
+    old_eflags = eflags;
+    /* Flip bit 21 */
+    eflags ^= (1 << 21);
+    __asm__ ("push %0; popf; pushf; pop %0;"
+            :"=r"(eflags)
+            :"0"(eflags));
+
+    /*
+    ** Test to see if our change held.  We don't restore eflags, a change to
+    ** the ID bit has no effect.
+    */
+    if (eflags == old_eflags)
+    {
+#if MR_DEBUG_CPU_FEATURE_DETECTION
+        fprintf(stderr, "This CPU doesn't support the CPUID instruction.\n");
+#endif
+        return;
+    }
+
+    /*
+    ** CPUID 0 gives the maximum basic CPUID page in EAX.  Basic pages go up to
+    ** but not including 0x40000000.
+    */
+    MR_cpuid(0, 0, &a, &b, &c, &d);
+    if (a < 1)
+        return;
+
+    /* CPUID 1 gives type, family, model and stepping information in EAX. */
+    MR_cpuid(1, 0, &a, &b, &c, &d);
+    
+    /* Bit 4 in EDX is high if RDTSC is available */
+    if (d & (1 << 4))
+        MR_rdtsc_is_available = MR_TRUE;
+
+    /*
+     * BTW: Intel can't count:
+     *
+     * http://www.pagetable.com/?p=18
+     * http://www.codinghorror.com/blog/archives/000364.html
+     *
+     * 486 (1989): family 4
+     * Pentium (1993): family 5
+     * Pentium Pro (1995): family 6, models 0 and 1
+     * Pentium 2 (1997): family 6, models 3, 5 and 6
+     * Pentium 3 (2000): family 6, models 7, 8, 10, 11
+     * Itanium (2001): family 7
+     * Pentium 4 (2000): family 15/0
+     * Itanium 2 (2002): family 15/1 and 15/2
+     * Pentium D: family 15/4
+     * Pentium M (2003): family 6, models 9 and 13
+     * Core (2006): family 6, model 14
+     * Core 2 (2006): family 6, model 15
+     * i7: family 6, model 26
+     * Atom: family 6, model 28
+     *
+     * This list is incomplete, it doesn't cover AMD or any other brand of x86
+     * processor, and it probably doesn't cover all post-pentium Intel
+     * processors.
+     */
+
+    /* bits 8-11 (first bit (LSB) is bit 0) */
+    MR_Unsigned extended_family, basic_family, family,
+        extended_model, model;
+    basic_family = (a & 0x00000F00) >> 8;
+    if (basic_family == 0x0F) {
+        /* bits 20-27 */
+        extended_family = (a & 0x0FF00000) >> 20;
+        family = basic_family + extended_family;
+    } else {
+        family = basic_family;
+    }
+    /* 
+    ** I'm not using the model value, but I'll leave the code here in case we
+    ** have a reason to use it in the future.
+    */
+    /* bits 4-7 */
+    model = (a & 0x000000F0) >> 4;
+    if ((basic_family == 0x0F) || (basic_family == 0x06))
+    {
+        /* bits 16-19 */
+        extended_model = (a & 0x000F0000) >> 16;
+        model += (extended_model << 4);
+    }
+#if MR_DEBUG_CPU_FEATURE_DETECTION
+    fprintf(stderr, "This is family %d and model %d\n", family, model);
+#endif 
+
+    /* Now check for P3 or higher since they have the extended pages */
+    if (family < 6) {
+        /* This is a 486 or Pentium */
+        return;
+    }
+    /*
+    ** I could bail out here if this was a pentium 3, but there's a more
+    ** reliable check for extended CPUID support below that should work on AMD
+    ** chips as well, if I knew all the model numbers for all family 6
+    ** processors and knew if they honoured extended CPUID.
+    */
+
+    /*
+    ** Extended CPUID 0x80000000.
+    ** 
+    ** EAX contains the maximum extended CPUID node.
+    */
+    MR_cpuid(0x80000000, 0, &a, &b, &c, &d);
+    if ((a & 0x80000000) == 0) {
+        /* 
+        ** Extended CPUID is not supported.
+        ** Note that this check is still not as reliable as I'd like.  If it
+        ** succeeds I'm not confident that the processor definitely implements
+        ** extended CPUID.
+        */
+        return;
+    }
+#if MR_DEBUG_CPU_FEATURE_DETECTION
+    fprintf(stderr, "Maximum extended CPUID node: 0x%x\n", a);
+#endif
+    if (a < 0x80000001)
+        return;
+
+    /*
+    ** Extended CPUID 0x80000001
+    **
+    ** If EDX bit 27 is set the RDTSCP instruction is available.
+    */
+    MR_cpuid(0x80000001, 0, &a, &b, &c, &d);
+#if MR_DEBUG_CPU_FEATURE_DETECTION
+    fprintf(stderr, "CPUID 0x80000001 EDX: 0x%x\n", d);
+#endif
+    if (!(d & (1 << 27)))
+        return;
+   
+    /*
+    ** Support for RDTSCP appears to be present
+    */
+#if MR_DEBUG_CPU_FEATURE_DETECTION
+    fprintf(stderr, "RDTSCP is available\n");
+#endif
+    MR_rdtscp_is_available = MR_TRUE;
+
+#endif
+}
+
+extern void
+MR_profiling_start_timer(MR_Timer *timer) {
+#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))
+    /*
+    ** If we don't have enough data to fill in all the fields of this structure
+    ** we leave them alone, we won't check them later without checking
+    ** MR_rdtsc{p}_is_available first.
+    */
+    if (MR_rdtscp_is_available == MR_TRUE)
+    {
+        MR_rdtscp(&(timer->MR_timer_time), &(timer->MR_timer_processor_id));
+    }
+    else if (MR_rdtsc_is_available == MR_TRUE)
+    {
+        MR_rdtsc(&(timer->MR_timer_time));
+    }
+#endif
+}
+
+extern void
+MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats) {
+#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))
+    MR_Timer            now;
+    MR_int_least64_t    duration;
+    MR_uint_least64_t   duration_squared;
+
+    if (MR_rdtscp_is_available == MR_TRUE)
+    {
+        MR_rdtscp(&(now.MR_timer_time), &(now.MR_timer_processor_id));
+        if (timer->MR_timer_processor_id == now.MR_timer_processor_id)
+        {
+            duration = now.MR_timer_time - timer->MR_timer_time;
+            duration_squared = duration * duration;
+            MR_atomic_inc_int(&(stats->MR_stat_count_recorded));
+            MR_atomic_add_int(&(stats->MR_stat_sum), duration);
+            MR_atomic_add_int(&(stats->MR_stat_sum_squares), duration_squared);
+        } else {
+            MR_atomic_inc_int(&(stats->MR_stat_count_not_recorded));
+        }
+    }
+    else if (MR_rdtsc_is_available == MR_TRUE)
+    {
+        MR_rdtsc(&(now.MR_timer_time));
+        duration = now.MR_timer_time - timer->MR_timer_time;
+        duration_squared = duration * duration;
+        MR_atomic_inc_int(&(stats->MR_stat_count_recorded));
+        MR_atomic_add_int(&(stats->MR_stat_sum), duration);
+        MR_atomic_add_int(&(stats->MR_stat_sum_squares), duration_squared);
+    }
+#else
+    /* No TSC support on this architecture or with this C compiler */
+    MR_atomic_inc_int(&(stats->MR_stat_count_recorded));
+#endif
+}
+
+/*
+** It's convenient that this instruction is the same on both i386 and amd64
+*/
+#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64__))
+
+static __inline__ void 
+MR_cpuid(MR_Unsigned code, MR_Unsigned sub_code,
+        MR_Unsigned *a, MR_Unsigned *b, MR_Unsigned *c, MR_Unsigned *d) {
+    __asm__("cpuid"
+        : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
+        : "0"(code), "2"(sub_code));
+}
+
+static __inline__ void
+MR_rdtscp(MR_uint_least64_t *tsc, MR_Unsigned *processor_id) {
+    MR_uint_least64_t tsc_high;
+
+    /*
+    ** On 64bit systems the high 32 bits of RAX and RDX are 0 filled by
+    ** rdtsc{p}
+    */
+    __asm__("rdtscp"
+           : "=a"(*tsc), "=d"(tsc_high), "=c"(*processor_id));
+
+    tsc_high = tsc_high << 32;
+    *tsc |= tsc_high; 
+}
+
+static __inline__ void
+MR_rdtsc(MR_uint_least64_t *tsc) {
+    MR_uint_least64_t tsc_high;
+
+    __asm__("rdtsc"
+           : "=a"(*tsc), "=d"(tsc_high));
+
+    tsc_high = tsc_high << 32;
+    *tsc |= tsc_high; 
+}
+
+#endif
+
+#endif
+
Index: runtime/mercury_atomic_ops.h
===================================================================
RCS file: /home/mercury1/repository/mercury/runtime/mercury_atomic_ops.h,v
retrieving revision 1.3
retrieving revision 1.5
diff -u -p -b -r1.3 -r1.5
--- runtime/mercury_atomic_ops.h	17 Jun 2009 03:26:00 -0000	1.3
+++ runtime/mercury_atomic_ops.h	16 Aug 2009 10:47:55 -0000	1.5
@@ -16,6 +16,16 @@
 
 #include "mercury_std.h"
 
+/*
+** AMD says that __amd64__ is defined by the compiler on 64-bit platforms,
+** while Intel says that __x86_64__ is the correct macro.  They refer to the
+** same architecture under different branding; we use __amd64__ below and
+** define it ourselves if necessary.
+*/
+#if defined(__x86_64__) && !defined(__amd64__)
+#define __amd64__
+#endif
+
 /*---------------------------------------------------------------------------*/
 #if defined(MR_LL_PARALLEL_CONJ)
 
@@ -37,7 +47,7 @@ MR_compare_and_swap_word(volatile MR_Int
             return __sync_bool_compare_and_swap(addr, old, new_val);        \
         } while (0)
 
-#elif defined(__GNUC__) && defined(__x86_64__)
+#elif defined(__GNUC__) && defined(__amd64__)
 
     #define MR_COMPARE_AND_SWAP_WORD_BODY                                   \
         do {                                                                \
@@ -47,7 +57,6 @@ MR_compare_and_swap_word(volatile MR_Int
                 "lock; cmpxchgq %3, %0; setz %1"                            \
                 : "=m"(*addr), "=q"(result)                                 \
                 : "m"(*addr), "r" (new_val), "a"(old)                       \
-                : "memory"                                                  \
             );                                                              \
             return (int) result;                                            \
         } while (0)
@@ -63,7 +72,7 @@ MR_compare_and_swap_word(volatile MR_Int
                 "lock; cmpxchgl %3, %0; setz %1"                            \
                 : "=m"(*addr), "=q"(result)                                 \
                 : "m"(*addr), "r" (new_val), "a"(old)                       \
-                : "memory");                                                \
+                );                                                          \
             return (int) result;                                            \
         } while (0)
 
@@ -86,9 +95,9 @@ MR_compare_and_swap_word(volatile MR_Int
 MR_EXTERN_INLINE void
 MR_atomic_inc_int(volatile MR_Integer *addr);
 
-#if defined(__GNUC__) && defined(__x86_64__)
+#if defined(__GNUC__) && defined(__amd64__)
 
-    #define MR_ATOMIC_INC_WORD_BODY                                         \
+    #define MR_ATOMIC_INC_INT_BODY                                          \
         do {                                                                \
             __asm__ __volatile__(                                           \
                 "lock; incq %0;"                                            \
@@ -100,7 +109,7 @@ MR_atomic_inc_int(volatile MR_Integer *a
 #elif defined(__GNUC__) && defined(__i386__)
 
     /* Really 486 or better. */
-    #define MR_ATOMIC_INC_WORD_BODY                                         \
+    #define MR_ATOMIC_INC_INT_BODY                                          \
         do {                                                                \
             __asm__ __volatile__(                                           \
                 "lock; incl %0;"                                            \
@@ -109,25 +118,25 @@ MR_atomic_inc_int(volatile MR_Integer *a
                 );                                                          \
         } while (0)
 
-#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+#else
 
     /*
-    ** gcc doesn't seem to have an atomic operation for increment, it does have
-    ** one for add though.  We prefer the hand-written increment operations
-    ** above.
+    ** Fall back to an atomic add 1 operation.
+    **
+    ** We could fall back to the built-in GCC instructions but they also fetch
+    ** the value.  I believe this is more efficient.
+    **  - pbone
     */
-    #define MR_ATOMIC_INC_WORD_BODY                                         \
-        do {                                                                \
-            __sync_add_and_fetch(addr, 1);                                  \
-        } while (0)
+    #define MR_ATOMIC_INC_INT_BODY                                          \
+        MR_atomic_add_int(addr, 1)                                          \
 
 #endif
 
-#ifdef MR_ATOMIC_INC_WORD_BODY
+#ifdef MR_ATOMIC_INC_INT_BODY
     MR_EXTERN_INLINE void 
     MR_atomic_inc_int(volatile MR_Integer *addr)
     {
-        MR_ATOMIC_INC_WORD_BODY;
+        MR_ATOMIC_INC_INT_BODY;
     }
 #endif
 
@@ -139,9 +148,9 @@ MR_atomic_inc_int(volatile MR_Integer *a
 MR_EXTERN_INLINE void
 MR_atomic_dec_int(volatile MR_Integer *addr);
 
-#if defined(__GNUC__) && defined(__x86_64__)
+#if defined(__GNUC__) && defined(__amd64__)
 
-    #define MR_ATOMIC_DEC_WORD_BODY                                         \
+    #define MR_ATOMIC_DEC_INT_BODY                                          \
         do {                                                                \
             __asm__ __volatile__(                                           \
                 "lock; decq %0;"                                            \
@@ -153,7 +162,7 @@ MR_atomic_dec_int(volatile MR_Integer *a
 #elif defined(__GNUC__) && defined(__i386__)
 
     /* Really 486 or better. */
-    #define MR_ATOMIC_DEC_WORD_BODY                                         \
+    #define MR_ATOMIC_DEC_INT_BODY                                          \
         do {                                                                \
             __asm__ __volatile__(                                           \
                 "lock; decl %0;"                                            \
@@ -161,31 +170,148 @@ MR_atomic_dec_int(volatile MR_Integer *a
                 : "m"(*addr)                                                \
                 );                                                          \
         } while (0)
+#else
+    /*
+    ** Fall back to an atomic subtract 1 operation.
+    */
+
+    #define MR_ATOMIC_DEC_INT_BODY                                          \
+        MR_atomic_sub_int(addr, 1)
+
+#endif
+
+#ifdef MR_ATOMIC_DEC_INT_BODY
+    MR_EXTERN_INLINE void 
+    MR_atomic_dec_int(volatile MR_Integer *addr)
+    {
+        MR_ATOMIC_DEC_INT_BODY;
+    }
+#endif
+
+MR_EXTERN_INLINE void
+MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend);
+
+#if defined(__GNUC__) && defined(__amd64__)
+
+    #define MR_ATOMIC_ADD_INT_BODY                                          \
+        do {                                                                \
+            __asm__ __volatile__(                                           \
+                "lock; addq %2, %0"                                         \
+                : "=m"(*addr)                                               \
+                : "m"(*addr), "r"(addend)                                   \
+                );                                                          \
+        } while (0)
+    
+#elif defined(__GNUC__) && defined(__i386__)
+    
+    #define MR_ATOMIC_ADD_INT_BODY                                          \
+        do {                                                                \
+            __asm__ __volatile__(                                           \
+                "lock; addl %2, %0;"                                        \
+                : "=m"(*addr)                                               \
+                : "m"(*addr), "r"(addend)                                   \
+                );                                                          \
+        } while (0)
 
 #elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
 
-    /*
-    ** gcc doesn't seem to have an atomic operation for increment, it does have
-    ** one for add though.  We prefer the hand-written increment operations
-    ** above.
-    */
-    #define MR_ATOMIC_DEC_WORD_BODY                                         \
+    #define MR_ATOMIC_ADD_INT_BODY                                          \
         do {                                                                \
-            __sync_sub_and_fetch(addr, 1);                                  \
+            __sync_add_and_fetch(addr, addend);                             \
         } while (0)
 
 #endif
 
-#ifdef MR_ATOMIC_DEC_WORD_BODY
+#ifdef MR_ATOMIC_ADD_INT_BODY
     MR_EXTERN_INLINE void 
-    MR_atomic_dec_int(volatile MR_Integer *addr)
+    MR_atomic_add_int(volatile MR_Integer *addr, MR_Integer addend)
+    {
+        MR_ATOMIC_ADD_INT_BODY;
+    }
+#endif
+
+MR_EXTERN_INLINE void
+MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x);
+
+#if defined(__GNUC__) && defined(__amd64__)
+
+    #define MR_ATOMIC_SUB_INT_BODY                                          \
+        do {                                                                \
+            __asm__ __volatile__(                                           \
+                "lock; subq %2, %0;"                                        \
+                : "=m"(*addr)                                               \
+                : "m"(*addr), "r"(x)                                        \
+                );                                                          \
+        } while (0)
+    
+#elif defined(__GNUC__) && defined(__i386__)
+    
+    #define MR_ATOMIC_SUB_INT_BODY                                          \
+        do {                                                                \
+            __asm__ __volatile__(                                           \
+                "lock; subl %2, %0;"                                        \
+                : "=m"(*addr)                                               \
+                : "m"(*addr), "r"(x)                                        \
+                );                                                          \
+        } while (0)
+
+#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+
+    #define MR_ATOMIC_SUB_INT_BODY                                          \
+        do {                                                                \
+            __sync_sub_and_fetch(addr, x);                                  \
+        } while (0)
+
+#endif
+
+#ifdef MR_ATOMIC_SUB_INT_BODY
+    MR_EXTERN_INLINE void
+    MR_atomic_sub_int(volatile MR_Integer *addr, MR_Integer x)
     {
-        MR_ATOMIC_DEC_WORD_BODY;
+        MR_ATOMIC_SUB_INT_BODY;
     }
 #endif
 
+/*
+ * Intel and AMD support a pause instruction that is roughly equivalent
+ * to a no-op.  Intel recommend that it is used in spin-loops to improve
+ * performance.  Without a pause instruction multiple simultaneous
+ * read-requests will be in-flight for the synchronization variable from a
+ * single thread.  Giving the pause instruction causes these to be executed
+ * in sequence allowing the processor to handle the change in the
+ * synchronization variable more easily.
+ *
+ * On some chips it may cause the spin-loop to use less power.
+ *
+ * This instruction was introduced with the Pentium 4 but is backwards
+ * compatible: the two-byte encoding of PAUSE is equivalent to a NOP
+ * prefixed by REPE, so older processors simply execute a no-op.
+ *
+ * This is not really an atomic instruction but we name it
+ * MR_ATOMIC_PAUSE for consistency.
+ *
+ * References: Intel and AMD documentation for PAUSE, Intel optimisation
+ * guide.
+ */
+#if defined(__GNUC__) && ( defined(__i386__) || defined(__amd64__) )
+
+    #define MR_ATOMIC_PAUSE                                                 \
+        do {                                                                \
+            __asm__ __volatile__("pause");                                  \
+        } while(0)
+
+#else
+
+    /* Fall back to a no-op */
+    #define MR_ATOMIC_PAUSE                                                 \
+        do {                                                                \
+            ;                                                               \
+        } while(0)
+
+#endif
+
 #endif /* MR_LL_PARALLEL_CONJ */
-/*---------------------------------------------------------------------------*/
 
 /*
 ** If we don't have definitions available for this compiler or architecture
@@ -193,4 +319,52 @@ MR_atomic_dec_int(volatile MR_Integer *a
 ** currently require any atomic ops.
 */
 
+/*---------------------------------------------------------------------------*/
+
+#if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)
+
+typedef struct {
+    MR_Unsigned         MR_stat_count_recorded;
+    MR_Unsigned         MR_stat_count_not_recorded;
+        /*
+        ** The total number of times this event occurred is implicitly the
+        ** sum of the recorded and not_recorded counts.
+        */
+    MR_int_least64_t    MR_stat_sum;
+    MR_uint_least64_t   MR_stat_sum_squares;
+        /*
+        ** The sum of squares is used to calculate variance and standard
+        ** deviation.
+        */
+} MR_Stats;
+
+typedef struct {
+    MR_uint_least64_t   MR_timer_time;
+    MR_Unsigned         MR_timer_processor_id;
+} MR_Timer;
+
+/*
+** Configure the profiling stats code.  On i386 and amd64 machines this uses
+** CPUID to determine if the RDTSCP instruction is available and not prohibited
+** by the OS.
+*/
+extern void
+MR_configure_profiling_timers(void);
+
+/*
+** Start and initialize a timer structure.
+*/
+extern void
+MR_profiling_start_timer(MR_Timer *timer);
+
+/*
+** Stop the timer and update stats with the results.
+*/
+extern void
+MR_profiling_stop_timer(MR_Timer *timer, MR_Stats *stats);
+
+#endif /* MR_THREAD_SAFE && MR_PROFILE_PARALLEL_EXECUTION_SUPPORT */
+
+/*---------------------------------------------------------------------------*/
+
 #endif /* not MERCURY_ATOMIC_OPS_H */
Index: runtime/mercury_context.c
===================================================================
RCS file: /home/mercury1/repository/mercury/runtime/mercury_context.c,v
retrieving revision 1.64
retrieving revision 1.66
diff -u -p -b -r1.64 -r1.66
--- runtime/mercury_context.c	13 Jul 2009 05:27:12 -0000	1.64
+++ runtime/mercury_context.c	16 Aug 2009 10:47:55 -0000	1.66
@@ -27,6 +27,9 @@ ENDINIT
 	#include <unistd.h>	/* for select() on OS X */
   #endif	
 #endif
+#if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT) 
+  #include <math.h> /* for sqrt and pow */
+#endif
 
 #include "mercury_memory_handlers.h"
 #include "mercury_context.h"
@@ -69,11 +72,26 @@ MR_PendingContext       *MR_pending_cont
 #if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT) 
 MR_bool                 MR_profile_parallel_execution = MR_FALSE;
 
-static MR_Integer       MR_profile_parallel_executed_global_sparks = 0;
+static MR_Stats         MR_profile_parallel_executed_global_sparks = 
+        { 0, 0, 0, 0 };
+static MR_Stats         MR_profile_parallel_executed_contexts = { 0, 0, 0, 0 };
+static MR_Stats         MR_profile_parallel_executed_nothing = { 0, 0, 0, 0 };
+/* This cannot be static as it is used in macros by other modules. */
+MR_Stats                MR_profile_parallel_executed_local_sparks = 
+        { 0, 0, 0, 0 };
 static MR_Integer       MR_profile_parallel_contexts_created_for_sparks = 0;
 
 /*
-** Write out the profiling data that we collect during exceution.
+** We don't access these atomically.  They are protected by the free context
+** list lock.
+*/
+static MR_Integer       MR_profile_parallel_small_context_reused = 0;
+static MR_Integer       MR_profile_parallel_regular_context_reused = 0;
+static MR_Integer       MR_profile_parallel_small_context_kept = 0;
+static MR_Integer       MR_profile_parallel_regular_context_kept = 0;
+
+/*
+** Write out the profiling data that we collect during execution.
 */
 static void
 MR_write_out_profiling_parallel_execution(void);
@@ -161,15 +179,18 @@ MR_finalize_thread_stuff(void)
 }
 
 #if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT) 
+static int
+fprint_stats(FILE *stream, const char *message, MR_Stats *stats);
+
 /*
- * Write out the profiling data for parallel execution.
- *
- * This writes out a flat text file which may be parsed by a machine or easily
- * read by a human.  There is no advantage in using a binary format since we
- * do this once at the end of execution and it's a small amount of data.
- * Therefore a text file is used since it has the advantage of being human
- * readable.
- */
+** Write out the profiling data for parallel execution.
+**
+** This writes out a flat text file which may be parsed by a machine or easily
+** read by a human.  There is no advantage in using a binary format since we
+** do this once at the end of execution and it's a small amount of data.
+** Therefore a text file is used since it has the advantage of being human
+** readable.
+*/
 static void
 MR_write_out_profiling_parallel_execution(void)
 {
@@ -182,14 +203,45 @@ MR_write_out_profiling_parallel_executio
     result = fprintf(file, "Mercury parallel execution profiling data\n\n");
     if (result < 0) goto Error;
 
-    result = fprintf(file, "Global sparks executed: %d\n",
-        MR_profile_parallel_executed_global_sparks); 
+    result = fprint_stats(file, "Global sparks executed",
+        &MR_profile_parallel_executed_global_sparks); 
+    if (result < 0) goto Error;
+
+    result = fprint_stats(file, "Global contexts executed",
+        &MR_profile_parallel_executed_contexts);
+    if (result < 0) goto Error;
+
+    result = fprint_stats(file, "MR_do_runnext executed nothing",
+        &MR_profile_parallel_executed_nothing);
+    if (result < 0) goto Error;
+
+    result = fprint_stats(file, "Local sparks executed",
+        &MR_profile_parallel_executed_local_sparks);
     if (result < 0) goto Error;
 
     result = fprintf(file, "Contexts created for global spark execution: %d\n",
         MR_profile_parallel_contexts_created_for_sparks);
     if (result < 0) goto Error;
 
+    result = fprintf(file, "Number of times a small context was reused: %d\n",
+        MR_profile_parallel_small_context_reused);
+    if (result < 0) goto Error;
+    
+    result = fprintf(file,
+            "Number of times a regular context was reused: %d\n",
+        MR_profile_parallel_regular_context_reused);
+    if (result < 0) goto Error;
+
+    result = fprintf(file, 
+            "Number of times a small context was kept for later use: %d\n",
+        MR_profile_parallel_small_context_kept);
+    if (result < 0) goto Error;
+    
+    result = fprintf(file, 
+            "Number of times a regular context was kept for later use: %d\n",
+        MR_profile_parallel_regular_context_kept);
+    if (result < 0) goto Error;
+
     if (0 != fclose(file)) goto Error;
 
     return;
@@ -198,6 +250,41 @@ MR_write_out_profiling_parallel_executio
         perror(MR_PROFILE_PARALLEL_EXECUTION_FILENAME);
         abort();
 }
+
+static int 
+fprint_stats(FILE *stream, const char *message, MR_Stats *stats) {
+    MR_Unsigned     count;
+    double          average;
+    double          sum_squared_over_n;
+    double          standard_deviation;
+
+    count = stats->MR_stat_count_recorded + stats->MR_stat_count_not_recorded;
+    
+    if (stats->MR_stat_count_recorded > 1)
+    {
+        average = (double)stats->MR_stat_sum /
+            (double)stats->MR_stat_count_recorded;
+        sum_squared_over_n = pow((double)stats->MR_stat_sum,2.0)/
+            (double)stats->MR_stat_count_recorded;
+        standard_deviation = 
+            sqrt(((double)stats->MR_stat_sum_squares - sum_squared_over_n) / 
+            (double)(stats->MR_stat_count_recorded - 1));
+
+        return fprintf(stream, 
+            "%s: count %d (%dr, %dnr), average %f, standard deviation %f\n",
+            message, count, stats->MR_stat_count_recorded, 
+            stats->MR_stat_count_not_recorded, average, standard_deviation);
+    } else if (stats->MR_stat_count_recorded == 1) {
+        return fprintf(stream, "%s: count %d (%dr, %dnr), sample %d\n",
+            message, count, stats->MR_stat_count_recorded, 
+            stats->MR_stat_count_not_recorded, stats->MR_stat_sum); 
+    } else {
+        return fprintf(stream, "%s: count %d (%dr, %dnr)\n",
+            message, count, stats->MR_stat_count_recorded, 
+            stats->MR_stat_count_not_recorded);
+    }
+}
+
 #endif
 
 static void 
@@ -396,9 +483,19 @@ MR_create_context(const char *id, MR_Con
     if (ctxt_size == MR_CONTEXT_SIZE_SMALL && free_small_context_list) {
         c = free_small_context_list;
         free_small_context_list = c->MR_ctxt_next;
+#if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)
+        if (MR_profile_parallel_execution) {
+            MR_profile_parallel_small_context_reused++;
+        }
+#endif
     } else if (free_context_list != NULL) {
         c = free_context_list;
         free_context_list = c->MR_ctxt_next;
+#if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT) 
+        if (MR_profile_parallel_execution) {
+            MR_profile_parallel_regular_context_reused++;
+        }
+#endif
     } else {
         c = NULL;
     }
@@ -452,10 +549,20 @@ MR_destroy_context(MR_Context *c)
         case MR_CONTEXT_SIZE_REGULAR:
             c->MR_ctxt_next = free_context_list;
             free_context_list = c;
+#if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)
+            if (MR_profile_parallel_execution) {
+                MR_profile_parallel_regular_context_kept++;
+            }
+#endif
             break;
         case MR_CONTEXT_SIZE_SMALL:
             c->MR_ctxt_next = free_small_context_list;
             free_small_context_list = c;
+#if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT) 
+            if (MR_profile_parallel_execution) {
+                MR_profile_parallel_small_context_kept++;
+            }
+#endif
             break;
     }
     MR_UNLOCK(&free_context_list_lock, "destroy_context");
@@ -628,6 +735,12 @@ MR_define_entry(MR_do_runnext);
     unsigned        depth;
     MercuryThread   thd;
 
+#ifdef MR_PROFILE_PARALLEL_EXECUTION_SUPPORT
+    MR_Timer        runnext_timer;
+    if (MR_profile_parallel_execution) {
+        MR_profiling_start_timer(&runnext_timer);
+    }
+#endif
     /*
     ** If this engine is holding onto a context, the context should not be
     ** in the middle of running some code.
@@ -689,6 +802,12 @@ MR_define_entry(MR_do_runnext);
         }
 
         /* Nothing to do, go back to sleep. */
+#ifdef MR_PROFILE_PARALLEL_EXECUTION_SUPPORT
+        if (MR_profile_parallel_execution) {
+            MR_profiling_stop_timer(&runnext_timer, 
+                    &MR_profile_parallel_executed_nothing);
+        }
+#endif
         while (MR_WAIT(&MR_runqueue_cond, &MR_runqueue_lock) != 0) {
         }
     }
@@ -709,6 +828,12 @@ MR_define_entry(MR_do_runnext);
     if (MR_ENGINE(MR_eng_this_context) != NULL) {
         MR_destroy_context(MR_ENGINE(MR_eng_this_context));
     }
+#ifdef MR_PROFILE_PARALLEL_EXECUTION_SUPPORT
+    if (MR_profile_parallel_execution) {
+        MR_profiling_stop_timer(&runnext_timer, 
+                &MR_profile_parallel_executed_contexts);
+    }
+#endif
     MR_ENGINE(MR_eng_this_context) = tmp;
     MR_load_context(tmp);
     MR_GOTO(tmp->MR_ctxt_resume);
@@ -724,18 +849,20 @@ MR_define_entry(MR_do_runnext);
         MR_load_context(MR_ENGINE(MR_eng_this_context));
 #ifdef MR_PROFILE_PARALLEL_EXECUTION_SUPPORT
         if (MR_profile_parallel_execution) {
-            MR_atomic_inc_int(&MR_profile_parallel_contexts_created_for_sparks);
+            MR_atomic_inc_int(
+                    &MR_profile_parallel_contexts_created_for_sparks);
         }
 #endif
     }
+    MR_parent_sp = spark.MR_spark_parent_sp;
+    MR_assert(MR_parent_sp != MR_sp);
+    MR_SET_THREAD_LOCAL_MUTABLES(spark.MR_spark_thread_local_mutables);
 #ifdef MR_PROFILE_PARALLEL_EXECUTION_SUPPORT
     if (MR_profile_parallel_execution) {
-        MR_atomic_inc_int(&MR_profile_parallel_executed_global_sparks);
+        MR_profiling_stop_timer(&runnext_timer, 
+                &MR_profile_parallel_executed_global_sparks);
     }
 #endif
-    MR_parent_sp = spark.MR_spark_parent_sp;
-    MR_assert(MR_parent_sp != MR_sp);
-    MR_SET_THREAD_LOCAL_MUTABLES(spark.MR_spark_thread_local_mutables);
     MR_GOTO(spark.MR_spark_resume);
 }
 #else /* !MR_THREAD_SAFE */
Index: runtime/mercury_context.h
===================================================================
RCS file: /home/mercury1/repository/mercury/runtime/mercury_context.h,v
retrieving revision 1.50
retrieving revision 1.52
diff -u -p -b -r1.50 -r1.52
--- runtime/mercury_context.h	13 Jul 2009 05:27:12 -0000	1.50
+++ runtime/mercury_context.h	16 Aug 2009 10:47:55 -0000	1.52
@@ -348,6 +348,18 @@ extern      MR_Context  *MR_runqueue_tai
 
 #if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT) 
 extern MR_bool  MR_profile_parallel_execution;
+
+extern MR_Stats     MR_profile_parallel_executed_local_sparks;
+
+#define MR_IF_PROFILE_PARALLEL_EXECUTION_SUPPORT(statement)                 \
+    do {                                                                    \
+        statement;                                                          \
+    } while (0);
+
+#else
+
+#define MR_IF_PROFILE_PARALLEL_EXECUTION_SUPPORT(statement)
+
 #endif
 
 /*
@@ -763,7 +775,8 @@ extern  void        MR_schedule_context(
 
   #define MR_fork_globally_criteria                                           \
     (MR_num_idle_engines != 0 &&                                              \
-    MR_num_outstanding_contexts_and_global_sparks < MR_max_outstanding_contexts)
+     MR_num_outstanding_contexts_and_global_sparks <                          \
+            MR_max_outstanding_contexts)
 
   /*
   ** These macros may be used as conditions for runtime parallelism decisions.
@@ -792,6 +805,12 @@ extern  void        MR_schedule_context(
   #define MR_join_and_continue(sync_term, join_label)                         \
     do {                                                                      \
         MR_SyncTerm *jnc_st = (MR_SyncTerm *) &sync_term;                     \
+MR_IF_PROFILE_PARALLEL_EXECUTION_SUPPORT(                                     \
+            MR_Timer MR_local_spark_timer;                                    \
+            if (MR_profile_parallel_execution == MR_TRUE) {                   \
+                MR_profiling_start_timer(&MR_local_spark_timer);              \
+            }                                                                 \
+        );                                                                    \
                                                                               \
         if (!jnc_st->MR_st_is_shared) {                                       \
             /* This parallel conjunction has only executed sequentially. */   \
@@ -843,6 +862,11 @@ extern  void        MR_schedule_context(
             &jnc_spark);                                                      \
         if (jnc_popped) {                                                     \
             MR_atomic_dec_int(&MR_num_outstanding_contexts_and_all_sparks);   \
+MR_IF_PROFILE_PARALLEL_EXECUTION_SUPPORT(                                     \
+            if (MR_profile_parallel_execution == MR_TRUE) {                   \
+                MR_profiling_stop_timer(&MR_local_spark_timer,                \
+                    &MR_profile_parallel_executed_local_sparks);              \
+            });                                                               \
             MR_GOTO(jnc_spark.MR_spark_resume);                               \
         } else {                                                              \
             MR_runnext();                                                     \
@@ -867,6 +891,11 @@ extern  void        MR_schedule_context(
             */                                                                \
             MR_UNLOCK(&MR_sync_term_lock, "continue_2 i");                    \
             MR_atomic_dec_int(&MR_num_outstanding_contexts_and_all_sparks);   \
+MR_IF_PROFILE_PARALLEL_EXECUTION_SUPPORT(                                     \
+            if (MR_profile_parallel_execution == MR_TRUE) {                   \
+                MR_profiling_stop_timer(&MR_local_spark_timer,                \
+                    &MR_profile_parallel_executed_local_sparks);              \
+            });                                                               \
             MR_GOTO(jnc_spark.MR_spark_resume);                               \
         } else {                                                              \
             /*                                                                \
Index: runtime/mercury_wrapper.c
===================================================================
RCS file: /home/mercury1/repository/mercury/runtime/mercury_wrapper.c,v
retrieving revision 1.197
diff -u -p -b -r1.197 mercury_wrapper.c
--- runtime/mercury_wrapper.c	13 Jul 2009 05:27:12 -0000	1.197
+++ runtime/mercury_wrapper.c	16 Aug 2009 05:55:24 -0000
@@ -521,6 +521,16 @@ mercury_runtime_init(int argc, char **ar
     }
 #endif
 
+#if defined(MR_THREAD_SAFE) && defined(MR_PROFILE_PARALLEL_EXECUTION_SUPPORT)
+    /*
+    ** Set up support for reading the CPU's TSC.  This is currently used by
+    ** profiling of the parallel runtime but may be used by other profiling
+    ** or timing code.  On architectures other than i386 and amd64 this is a
+    ** no-op.
+    */
+    MR_configure_profiling_timers();
+#endif 
+
     /*
     ** This must be done before MR_init_conservative_GC(),
     ** to ensure that the GC's signal handler gets installed