[m-rev.] For post-commit review: Automatic parallelisation improvements.

Paul Bone pbone at csse.unimelb.edu.au
Tue Jun 8 22:57:53 AEST 2010


For post-commit review.  Probably by Zoltan.


Automatic parallelisation improvements.

The automatic parallelisation analysis now searches for the best way to
parallelise a conjunction when it's deciding if parallelisation is worth-while.
This mostly affects unifications and other cheap goals between two calls being
parallelised; it decides which parallel conjunct each of these goals should be
placed in.  The search used is a branch and bound search.

deep_profiler/mdprof_fb.automatic_parallelism.m:
    As above,

    find_costly_call now returns find_costly_call_result rather than using the
    maybe type.  Instantiation sub-typing is used to guarantee that the goal
    returned is indeed a costly call.

    Fix a bug in calculate_dependant_parallel_cost.

    When printing the candidate parallel conjunction report include the goals
    before and after a parallel conjunction.

    Document the debug_parallel_conjunction_speedup trace flag.
    
    Create a new type parallelise_dep_conjs rather than using a boolean to
    represent this option.

    Add a new trace flag, debug_branch_and_bound enables trace goals that can
    be used to debug the branch and bound search for the best parallelisation.

deep_profiler/mdprof_feedback.m:
    When printing the candidate parallel conjunction report remove some
    vertical whitespace after the header of the report. 

deep_profiler/Mercury.options:
    Add a new trace flag, debug_branch_and_bound enables trace goals that can
    be used to debug the branch and bound search for the best parallelisation.
    This is commented out.

mdbcomp/feedback.m:
    Include the goals before and after a parallel conjunction in the
    candidate_par_conjunction type.

    Modify the parallel_exec_metrics code so that it can handle the cost of the
    goals before and after a parallel conjunction.

    Increment the feedback file format version.

    When reading a feedback file strip the program name of leading and trailing
    whitespace.

    When printing a message for the incorrect_program_name error enclose the
    program names in quotes.

    Conform to changes in mdprof_fb.automatic_parallelism.m

Index: deep_profiler/Mercury.options
===================================================================
RCS file: /home/mercury1/repository/mercury/deep_profiler/Mercury.options,v
retrieving revision 1.13
diff -u -p -b -r1.13 Mercury.options
--- deep_profiler/Mercury.options	30 Apr 2010 13:09:54 -0000	1.13
+++ deep_profiler/Mercury.options	8 Jun 2010 12:42:27 -0000
@@ -40,4 +40,5 @@ MCFLAGS-top_procs = 	--no-optimize-dupli
 #MCFLAGS-mdprof_fb.automatic_parallelism = \
 #	--trace-flag=debug_cpc_search \
 #	--trace-flag=debug_recursive_costs \
-#	--trace-flag=debug_parallel_conjunction_speedup
+#	--trace-flag=debug_parallel_conjunction_speedup \
+#	--trace-flag=debug_branch_and_bound
Index: deep_profiler/mdprof_fb.automatic_parallelism.m
===================================================================
RCS file: /home/mercury1/repository/mercury/deep_profiler/mdprof_fb.automatic_parallelism.m,v
retrieving revision 1.7
diff -u -p -b -r1.7 mdprof_fb.automatic_parallelism.m
--- deep_profiler/mdprof_fb.automatic_parallelism.m	30 Apr 2010 13:09:54 -0000	1.7
+++ deep_profiler/mdprof_fb.automatic_parallelism.m	8 Jun 2010 12:50:11 -0000
@@ -24,7 +24,6 @@
 :- import_module mdbcomp.feedback.
 :- import_module mdbcomp.program_representation.
 
-:- import_module bool.
 :- import_module cord.
 :- import_module int.
 :- import_module float.
@@ -53,9 +52,13 @@
                 cpc_locking_cost            :: int,
                 cpc_clique_threshold        :: int,
                 cpc_call_site_threshold     :: int,
-                cpc_parallelise_dep_conjs   :: bool
+                cpc_parallelise_dep_conjs   :: parallelise_dep_conjs 
             ).
     
+:- type parallelise_dep_conjs
+    --->    parallelise_dep_conjs
+    ;       do_not_parallelise_dep_conjs.
+
 %-----------------------------------------------------------------------------%
     
     % Perform Jerome's analysis and update the feedback info structure.
@@ -96,6 +99,7 @@
 :- import_module map.
 :- import_module maybe.
 :- import_module multi_map.
+:- import_module mutvar.
 :- import_module pqueue.
 :- import_module require.
 :- import_module set.
@@ -112,6 +116,12 @@
 %	--trace-flag=debug_recursive_costs 
 %     Debug the calculation of the costs of recursive call sites.
 %
+%   --trace-flag=debug_parallel_conjunction_speedup
+%     Print candidate parallel conjunctions that are pessimisations.
+%
+%   --trace-flag=debug_branch_and_bound
+%     Debug the branch and bound search for the best parallelisation.
+%
 
 candidate_parallel_conjunctions(Opts, Deep, Messages, !Feedback) :-
     Opts = candidate_parallel_conjunctions_opts(DesiredParallelism,
@@ -193,6 +203,9 @@ pard_goal_detail_to_pard_goal(pard_goal_
                     % The inst map info attached to the original goal.
             ).
 
+:- inst pard_goal_detail(T)
+    ---> pard_goal_detail(T, ground, ground, ground).
+
 :- type pard_goal_type 
     --->    pgt_call(
                 pgtc_callee             :: callee_rep,
@@ -1384,8 +1397,6 @@ partition_pard_goals(Location, [ PG | PG
 
 pardgoals_build_candidate_conjunction(Info, DependencyMaps, Location, GoalPath,
         GoalsPartition, MaybeCandidate) :-
-    ParalleliseDependantConjs = Info ^ ipi_opts ^ cpc_parallelise_dep_conjs,
-
     % Setting up the first parallel conjunct is a different algorithm to the
     % latter ones, at this point we have the option of moving goals from before
     % the first costly call to either before or during the parallel
@@ -1395,64 +1406,21 @@ pardgoals_build_candidate_conjunction(In
     % conjunction dependant when it could be independent.
     pard_goals_partition(GoalsList, PartNum) = GoalsPartition,
     Goals = cord.from_list(GoalsList),
-    find_costly_call(Goals, cord.empty, GoalsBefore0, MaybeFirstCall,
-        GoalsDuringAfter),
-    (
-        MaybeFirstCall = yes(FirstCall),
-        FirstCallType = FirstCall ^ pgd_pg_type,
+    find_best_parallelisation(Info, Location, PartNum, DependencyMaps,
+        Goals, Parallelisation),
         (
-            FirstCallType = pgt_call(_, _, _, _, _, FirstCallCallSite)
-        ;
-            ( FirstCallType = pgt_cheap_call(_, _, _, _, _)
-            ; FirstCallType = pgt_other_atomic_goal
-            ; FirstCallType = pgt_non_atomic_goal
-            ),
-            location_to_string(1, Location, LocationString),
-            Error = singleton(this_file) ++ singleton("\n") ++
-                LocationString ++
-                nl_indent(1) ++ singleton(format("partition %d", [i(PartNum)]))
-                ++ nl_indent(1) ++ 
-                    singleton("First call in conjunction is not a call\n"),
-            error(append_list(cord.list(Error)))
-        )
-    ;
-        MaybeFirstCall = no,
-        location_to_string(1, Location, LocationString),
-        Error = singleton(this_file) ++ singleton("\n") ++
-            LocationString ++
-            nl_indent(1) ++ singleton(format("partition %d", [i(PartNum)])) ++
-            nl_indent(1) ++ singleton("Couldn't find first call\n"),
-        error(append_list(cord.list(Error)))
-    ),
-    build_first_seq_conjunction(DependencyMaps, 
-        FirstCall ^ pgd_conj_num, length(GoalsBefore0),
-        length(GoalsDuringAfter), GoalsBefore0,
-        cord.singleton(FirstCall), FirstSeqConjunction0, _GoalsBefore),
-    ( get_first(FirstSeqConjunction0, FirstConj) ->
-        FirstConjNumOfFirstParConjunct = FirstConj ^ pgd_conj_num
+        Parallelisation = bp_no_best_parallelisation,
+        MaybeCandidate = no
     ;
-        error(this_file ++ "Found empty first parallel conjunct")
-    ),
-    pardgoals_build_par_conjs(GoalsDuringAfter, DependencyMaps,
-        FirstConjNumOfFirstParConjunct, FirstSeqConjunction0, _GoalsAfter,
-        cord.empty, ParConjsCord, conjuncts_are_independent, IsDependant),
-    
-    (
-        not ( 
-            IsDependant = conjuncts_are_dependant,
-            ParalleliseDependantConjs = no
-        )
-    ->
-        calculate_parallel_cost(Info, ParConjsCord, ParMetricsIncomp),
-        NumCalls = FirstCallCallSite ^ ccsr_call_site_summary ^
-            perf_row_calls,
-        ParExecMetrics = 
-            finalise_parallel_exec_metrics(ParMetricsIncomp, NumCalls),
-        Speedup = parallel_exec_metrics_get_speedup(ParExecMetrics),
-        ParConjs = list(ParConjsCord),
-        Candidate = candidate_par_conjunction(
-            goal_path_to_string(GoalPath), PartNum, IsDependant,
-            ParConjs, ParExecMetrics),
+        Parallelisation = bp_parallel_execution(GoalsBeforeCord, ParConjsCord,
+            GoalsAfterCord, IsDependent, Metrics),
+        Speedup = parallel_exec_metrics_get_speedup(Metrics),
+        GoalsBefore = list(GoalsBeforeCord),
+        GoalsAfter = list(GoalsAfterCord),
+        ParConjs = map((func(CordI) = seq_conj(list(CordI))),
+            list(ParConjsCord)),
+        Candidate = candidate_par_conjunction(goal_path_to_string(GoalPath),
+            PartNum, IsDependent, GoalsBefore, ParConjs, GoalsAfter, Metrics),
         ( Speedup > 1.0 ->
             MaybeCandidate = yes(Candidate)
         ;
@@ -1475,199 +1443,291 @@ pardgoals_build_candidate_conjunction(In
                 create_candidate_parallel_conj_report(ProcLabel - FBCandidate, 
                     Report),
                 io.write_string("Not parallelising conjunction, " ++ 
-                    "insufficient speedup:", !IO),
-                io.write_string(append_list(cord.list(Report)), !IO),
-                io.nl(!IO)
+                    "insufficient speedup:\n", !IO),
+                io.write_string(append_list(cord.list(Report)), !IO)
             )
         )
-    ;
-        MaybeCandidate = no
     ).
 
-:- pred build_first_seq_conjunction(dependency_maps::in, 
-    int::in, int::in, int::in, cord(pard_goal_detail)::in, 
-    cord(pard_goal_detail)::in, cord(pard_goal_detail)::out,
-    cord(pard_goal_detail)::out) is det. 
-
-build_first_seq_conjunction(DependencyMaps, CallConjNum, NumGoalsBefore,
-        NumGoalsAfter, Goals0, !FirstSeqConjunction, GoalsBefore) :-
-    % Move over goals in reverse order.
-    ( split_last(Goals0, Goals, Goal) ->
-        ConjNum = Goal ^ pgd_conj_num,
-        depends_lookup_rev(DependencyMaps, ConjNum, GoalRevDeps),
-        depends_lookup_rev(DependencyMaps, CallConjNum, CallRevDeps),
-        (
-            % If later goals depend on this goal but not on the first call.
-            member(LaterGoalNum, (CallConjNum+1)..(CallConjNum+NumGoalsAfter)),
-            (
-                member(LaterGoalNum, GoalRevDeps)
-            =>
-                not member(LaterGoalNum, CallRevDeps)
-            )
-        ->
-            % Then this goal and all those before it must be in GoalsBefore.
-            % This can be pessimistic but we're not allowed to re-order goals.
-            GoalsBefore = Goals0
-        ;
-            % This goal may be parallelised as part of the first conjunction.
-            !:FirstSeqConjunction = cons(Goal, !.FirstSeqConjunction),
-            build_first_seq_conjunction(DependencyMaps, CallConjNum,
-                NumGoalsBefore, NumGoalsAfter, Goals, !FirstSeqConjunction,
-                GoalsBefore)
+:- type find_costly_call_result
+    --->    costly_call_found(
+                fccrfc_goals_before     :: cord(pard_goal_detail),
+                fccrfc_call             :: pard_goal_detail,
+                fccrfc_gaols_after      :: cord(pard_goal_detail)
         )
-    ;
-        GoalsBefore = cord.empty
-    ).
+    ;       costly_call_not_found.
 
-:- pred pardgoals_build_par_conjs(cord(pard_goal_detail)::in,
-    dependency_maps::in, int::in,
-    cord(pard_goal_detail)::in, cord(pard_goal_detail)::out,
-    cord(seq_conj(pard_goal_detail))::in,
-    cord(seq_conj(pard_goal_detail))::out,
-    conjuncts_are_dependant::in, conjuncts_are_dependant::out) is det.
-
-pardgoals_build_par_conjs(Goals0, DependencyMaps,
-        FirstConjNumOfFirstParConjunct, CurSeqConjunction0, GoalsAfter,
-        !ParConjs, !ConjsAreDependant) :-
-    % Find the next costly call.
-    find_costly_call(Goals0, cord.empty, GoalsBefore, MaybeNextCall, 
-        Goals),
-    (
-        MaybeNextCall = yes(NextCall),
-       
-        % XXX: We put all the goals between two paralleliseable goals in the
-        % parallel conjunct with the second goal.  This seems to work as often
-        % small unifications occurring before the next goal are used to prepare
-        % some arguments for it.
-        CurSeqConjunction = CurSeqConjunction0,
-        NewSeqConjunction = snoc(GoalsBefore, NextCall),
-      
-        % Has the conjunction become dependant.
-        map_pred(pg_get_conj_num, CurSeqConjunction, CurSeqConjNumsCord),
-        CurSeqConjNumsSet = set(list(CurSeqConjNumsCord)),
-        map_pred(pg_get_conj_num, NewSeqConjunction, NewSeqConjNumsCord),
-        (
-            !.ConjsAreDependant = conjuncts_are_independent,
-            member(NewSeqConjNum, NewSeqConjNumsCord),
-            depends_lookup(DependencyMaps, NewSeqConjNum, Dependencies),
-            intersect(Dependencies, CurSeqConjNumsSet, Intersection),
-            not set.empty(Intersection)
-        ->
-            !:ConjsAreDependant = conjuncts_are_dependant
-        ;
-            true
-        ),
+:- inst find_costly_call_result
+    --->    costly_call_found(ground, pard_goal_detail(pgt_call), ground)
+    ;       costly_call_not_found.
 
-        SeqConjunction = seq_conj(list(CurSeqConjunction)),
-        !:ParConjs = snoc(!.ParConjs, SeqConjunction),
-        pardgoals_build_par_conjs(Goals, DependencyMaps,
-            FirstConjNumOfFirstParConjunct, NewSeqConjunction, GoalsAfter,
-            !ParConjs, !ConjsAreDependant)
-    ;
-        MaybeNextCall = no,
-        ( cord.get_first(CurSeqConjunction0, FirstGoalInLastConjunct) ->
-            FirstConjNumOfLastParConjunct = 
-                FirstGoalInLastConjunct ^ pgd_conj_num
-        ; 
-            error(this_file ++ " empty parallel conjunct")
-        ),
-        % Because this is the last parallel conjunction we have the
-        % option of putting some remaining goals into the parallel conjunction
-        % as part of the last parallel conjunct.
-        sorted_list_to_set(
-            FirstConjNumOfFirstParConjunct..(FirstConjNumOfLastParConjunct-1),
-            GoalsInOtherParConjuncts),
-        build_last_seq_conjunction(Goals0, DependencyMaps,
-            GoalsInOtherParConjuncts, GoalsAfter, 
-            cord.empty, GoalsInLastConjunct),
-        SeqConjunction = seq_conj(list(
-            CurSeqConjunction0 ++ GoalsInLastConjunct)),
-        !:ParConjs = snoc(!.ParConjs, SeqConjunction)
-    ).
-
-:- pred find_costly_call(cord(pard_goal_detail)::in,
-    cord(pard_goal_detail)::in, cord(pard_goal_detail)::out,
-    maybe(pard_goal_detail)::out,
-    cord(pard_goal_detail)::out) is det.
+:- pred find_costly_call(cord(pard_goal_detail)::in, cord(pard_goal_detail)::in,
+    find_costly_call_result::out(find_costly_call_result)) is det.
 
-find_costly_call(Goals, !GoalsBefore, MaybeCall, GoalsAfter) :-
+find_costly_call(Goals, GoalsBefore0, Result) :-
     ( head_tail(Goals, Goal, GoalsTail) ->
         GoalType = Goal ^ pgd_pg_type,
         ( 
             GoalType = pgt_call(_, _, _, _, _, _),
-            MaybeCall = yes(Goal),
-            GoalsAfter = GoalsTail
+            Result = costly_call_found(GoalsBefore0, Goal, GoalsTail)
         ;
             ( GoalType = pgt_cheap_call(_, _, _, _, _)
             ; GoalType = pgt_other_atomic_goal
             ),
-            !:GoalsBefore = snoc(!.GoalsBefore, Goal),
-            find_costly_call(GoalsTail, !GoalsBefore, MaybeCall, GoalsAfter)
+            GoalsBefore = snoc(GoalsBefore0, Goal),
+            find_costly_call(GoalsTail, GoalsBefore, Result)
         ;
             GoalType = pgt_non_atomic_goal,
             error(this_file ++ "Found non-atomic goal")
         )
     ;
-        MaybeCall = no,
-        GoalsAfter = cord.empty
+        Result = costly_call_not_found
     ).
 
-:- pred build_last_seq_conjunction(cord(pard_goal_detail)::in, 
-    dependency_maps::in, set(int)::in, cord(pard_goal_detail)::out,
-    cord(pard_goal_detail)::in, cord(pard_goal_detail)::out) is det.
-
-build_last_seq_conjunction(Goals0, DependencyMaps, GoalsInOtherParConjuncts, 
-        GoalsAfter, !GoalsInLastConjunct) :-
-    ( head_tail(Goals0, Goal, Goals) ->
-        ConjNum = Goal ^ pgd_conj_num,
-        depends_lookup(DependencyMaps, ConjNum, Dependencies),
-        intersect(Dependencies, GoalsInOtherParConjuncts, Intersection),
-        (
-            % The goal is dependant apon a goal in a previous parallel conjunct.
-            not set.empty(Intersection) => 
-            % But not dependant upon a goal in the current parallel conjunct.
-            (
-                map_pred(pg_get_conj_num, !.GoalsInLastConjunct, 
-                    ConjNumsInLastConjunct),
-                intersect(Dependencies, set(list(ConjNumsInLastConjunct)),
-                    Intersection2),
-                set.empty(Intersection2)
+:- type best_parallelisation
+    --->    bp_parallel_execution(
+                bp_goals_before         :: cord(pard_goal_detail),
+                bp_par_conjs            :: cord(cord(pard_goal_detail)),
+                bp_goals_after          :: cord(pard_goal_detail),
+                bp_is_dependent         :: conjuncts_are_dependant,
+                bp_par_exec_metrics     :: parallel_exec_metrics
+            )
+                % Rather than using the sequential execution as an initial 
+                % value for the branch and bound search we use this value.
+                % This allows us to report the best parallelisation even when
+                % sequential execution is optimal.
+    ;       bp_no_best_parallelisation.
+
+:- type incomplete_parallelisation
+    --->    incomplete_parallelisation(
+                ip_goals_before         :: cord(pard_goal_detail),
+                ip_par_conjs            :: cord(cord(pard_goal_detail)),
+                ip_par_exec_overlap     :: parallel_execution_overlap,
+                ip_par_exec_metrics     :: parallel_exec_metrics_incomplete,
+                ip_is_outermost_conj    :: is_outermost_conjunct,
+                ip_can_make_cheap_conj  :: can_make_cheap_conj
+            ).
+
+:- type can_make_cheap_conj
+    --->    can_make_cheap_conj
+    ;       cannot_make_cheap_conj.
+
+:- pred find_best_parallelisation(implicit_parallelism_info::in, 
+    program_location::in, int::in, dependency_maps::in, 
+    cord(pard_goal_detail)::in, best_parallelisation::out) is det.
+
+find_best_parallelisation(Info, Location, PartNum, DependencyMaps, Goals,
+        FinalBestParallelisation) :-
+    % Use a failure driven loop to implement a branch and bound solver.
+    promise_pure (
+        trace [compile_time(flag("debug_branch_and_bound")), io(!IO)] (
+            io.format("D: Branch and bound loop starting:\n%s\n",
+                [s(LocationString)], !IO),
+            location_to_string(1, Location, LocationCord),
+            string.append_list(list(LocationCord), LocationString)
+        ),
+        impure new_mutvar(bp_no_best_parallelisation,
+            BestParallelisationMutvar),
+        (
+            impure find_best_parallelisation_nondet(Info, Location, PartNum, 
+                DependencyMaps, Goals, BestParallelisationMutvar, 
+                BestParallelisation),
+            % this is the best parallelisation so far.
+            impure set_mutvar(BestParallelisationMutvar, BestParallelisation),
+            trace [compile_time(flag("debug_branch_and_bound")), io(!IO)] (
+                (
+                    BestParallelisation = bp_no_best_parallelisation
+                ;
+                    BestParallelisation = bp_parallel_execution(_, _, _, _,
+                        Metrics),
+                    io.format("D: Branch and bound: Solution par time: %f\n",
+                        [f(ParTime)], !IO),
+                    ParTime = parallel_exec_metrics_get_par_time(Metrics)
             )
+            ),
+            semidet_fail
         ->
-            % This goal and all those after it must be placed after the
-            % conjunction.
-            GoalsAfter = Goals0
+            error("Failure driven loop must fail")
         ;
-            % This goal does not depend on goals in other parallel conjuncts,
-            % we can parallelise it here without introducing an extra future.
-            !:GoalsInLastConjunct = snoc(!.GoalsInLastConjunct, Goal),
-            build_last_seq_conjunction(Goals, DependencyMaps,
-                GoalsInOtherParConjuncts, GoalsAfter, !GoalsInLastConjunct)
+            impure get_mutvar(BestParallelisationMutvar, 
+                FinalBestParallelisation),
+            trace [compile_time(flag("debug_branch_and_bound")), io(!IO)] (
+                io.write_string("D: Branch and bound loop finished\n", !IO)
+            )
         )
-    ;
-        GoalsAfter = cord.empty
     ).
 
-    % calculate_dependant_parallel_cost(Conjs, ParExecMetrics).
-    %
-    % Analyse the parallel conjuncts and determine their likely performance.
-    %
-    % This is the new parallel execution overlap algorithm, it is general and
-    % therefore we also use is for independent conjunctions.
-    %
-:- pred calculate_parallel_cost(implicit_parallelism_info::in,
-    cord(seq_conj(pard_goal_detail))::in,
-    parallel_exec_metrics_incomplete::out) is det.
-
-calculate_parallel_cost(Info, Conjs, ParallelExecMetrics) :-
-    % We parepare the conjunctions before calculating the parallelisation cost.
-    % In doing this we create a left-associative data structure that makes the
-    % left-associative walk in the next step easier.  The data structure also
-    % contains extra information preventing quadratic code in the actual
-    % calculation.
-    calculate_parallel_cost_2(Info, Conjs, is_outermost_conjunct,
-        ParallelExecMetrics, _ParallelExecOverlap, _ParallelExec,
-        _NumConjuncts).
+:- impure pred find_best_parallelisation_nondet(implicit_parallelism_info::in,
+    program_location::in, int::in, dependency_maps::in, 
+    cord(pard_goal_detail)::in, 
+    mutvar(best_parallelisation)::in, best_parallelisation::out) is nondet.
+
+find_best_parallelisation_nondet(Info, Location, PartNum, DependencyMaps, Goals0, 
+        BestParallelisationMutvar, BestParallelisation) :-
+    find_costly_call(Goals0, cord.empty, FindCostlyCallResult),
+    (
+        FindCostlyCallResult = costly_call_found(GoalsBeforeFirstCall, 
+            FirstCall, GoalsAfterFirstCall)
+    ;
+        FindCostlyCallResult = costly_call_not_found,
+        location_to_string(1, Location, LocationString),
+        Error = singleton(this_file) ++ singleton("\n") ++
+            LocationString ++
+            nl_indent(1) ++ singleton(format("partition %d", [i(PartNum)])) ++
+            nl_indent(1) ++ singleton("Couldn't find first call\n"),
+        error(append_list(cord.list(Error)))
+    ),
+    FirstCallCallSite = FirstCall ^ pgd_pg_type ^ pgtc_call_site,
+    NumCalls = FirstCallCallSite ^ ccsr_call_site_summary ^
+        perf_row_calls,
+   
+    % Generate goals before conjunction.
+    cord_split(GoalsBeforeFirstCall, GoalsBeforeConj, 
+        GoalsBeforeCallInFirstConj),
+    Goals1 = GoalsBeforeCallInFirstConj ++ singleton(FirstCall) ++ 
+        GoalsAfterFirstCall,
+
+    foldl_pred(pardgoal_calc_cost, GoalsBeforeConj, 0.0, CostBeforeConj),
+    Metrics0 = init_empty_parallel_exec_metrics(CostBeforeConj),
+    Parallelisation0 = incomplete_parallelisation(GoalsBeforeConj, 
+        empty, peo_empty_conjunct, Metrics0, is_outermost_conjunct,
+        cannot_make_cheap_conj),
+    impure find_best_parallelisation_2(Info, DependencyMaps, 
+        BestParallelisationMutvar, 0, map.init, Goals1, 
+        GoalsAfterConj, Parallelisation0, Parallelisation1),
+
+    % Finalise the metrics and determine if this is the best solution so far.
+    foldl_pred(pardgoal_calc_cost, GoalsAfterConj, 0.0, CostAfterConj),
+    Metrics1 = Parallelisation1 ^ ip_par_exec_metrics,
+    SparkDelay = Info ^ ipi_opts ^ cpc_sparking_delay,
+    SparkCost = Info ^ ipi_opts ^ cpc_sparking_cost,
+    Metrics = finalise_parallel_exec_metrics(Metrics1, NumCalls, CostAfterConj, 
+        float(SparkDelay + SparkCost)),
+
+    impure get_mutvar(BestParallelisationMutvar, CurrentBestParallelisation),
+    cbmr_metrics_is_better = compare_best_parallelisation_and_metrics(
+        CurrentBestParallelisation, Metrics),
+
+    Conjuncts = Parallelisation1 ^ ip_par_conjs,
+    Overlap = Parallelisation1 ^ ip_par_exec_overlap,
+    par_conj_overlap_is_dependant(Overlap, IsDependent),
+    BestParallelisation = bp_parallel_execution(GoalsBeforeConj, Conjuncts,
+        GoalsAfterConj, IsDependent, Metrics).
+
+:- impure pred find_best_parallelisation_2(implicit_parallelism_info::in,
+    dependency_maps::in, mutvar(best_parallelisation)::in, 
+    int::in, map(var_rep, float)::in, 
+    cord(pard_goal_detail)::in, cord(pard_goal_detail)::out, 
+    incomplete_parallelisation::in, incomplete_parallelisation::out) is nondet. 
+
+find_best_parallelisation_2(Info, DependencyMaps, BestParallelisationMutvar, 
+        NumConjuncts0, ProductionsMap0, !Goals, !Parallelisation) :-
+    IsOutermostConjunct = !.Parallelisation ^ ip_is_outermost_conj,
+    find_costly_call(!.Goals, cord.empty, FindCostlyCallResult),
+    (
+        FindCostlyCallResult = 
+            costly_call_found(GoalsBefore0, Call, GoalsAfter0),
+        
+        (
+            can_make_cheap_conj = !.Parallelisation ^ ip_can_make_cheap_conj, 
+            find_costly_call(GoalsAfter0, cord.empty, 
+                costly_call_found(_, _, _)),
+            cord_split(GoalsBefore0, Conj, GoalsBefore),
+            !:Goals = snoc(GoalsBefore, Call) ++ GoalsAfter0,
+            !Parallelisation ^ ip_can_make_cheap_conj := cannot_make_cheap_conj
+        ;
+            % Determine how many goals to include after the call in this
+            % parallel conjunct.
+            cord_split(GoalsAfter0, GoalsAfter, !:Goals),
+            % Don't include two costly calls in the same parallel conjunct.
+            find_costly_call(GoalsAfter, cord.empty, costly_call_not_found),
+        
+            % Build the new conjunct and test to see if this is no worse than
+            % our best parallelisation.
+            Conj = snoc(GoalsBefore0, Call) ++ GoalsAfter,
+
+            % If we've found the last costly call then there are no more
+            % conjuncts to make and therefore we cannot make a cheap conjunct,
+            % otherwise we can make a cheap conjunct.
+            find_costly_call(!.Goals, cord.empty, Result2),
+            (
+                Result2 = costly_call_not_found,
+                !Parallelisation ^ ip_can_make_cheap_conj :=
+                    cannot_make_cheap_conj
+            ;
+                Result2 = costly_call_found(_, _, _),
+                !Parallelisation ^ ip_can_make_cheap_conj := can_make_cheap_conj
+            )
+        ),
+        not is_empty(Conj),
+        Conjs0 = !.Parallelisation ^ ip_par_conjs,
+        Conjs = snoc(Conjs0, Conj),
+        
+        Metrics0 = !.Parallelisation ^ ip_par_exec_metrics,
+        Overlap0 = !.Parallelisation ^ ip_par_exec_overlap,
+        calculate_parallel_cost_step(Info, IsOutermostConjunct, NumConjuncts0,
+            Conj, ProductionsMap0, ProductionsMap, Metrics0, Metrics, 
+            Overlap0, Overlap),
+        (
+            Overlap = peo_empty_conjunct,
+            DepVars = set.init
+        ;
+            Overlap = peo_conjunction(_, _, DepVars)
+        ),
+        ParalleliseDepConjs = Info ^ ipi_opts ^ cpc_parallelise_dep_conjs,
+        (
+            ParalleliseDepConjs = do_not_parallelise_dep_conjs
+        =>
+            set.empty(DepVars)
+        ),
+        SparkCost = Info ^ ipi_opts ^ cpc_sparking_cost,
+        SparkDelay = Info ^ ipi_opts ^ cpc_sparking_delay,
+        MetricsComplete = finalise_parallel_exec_metrics(Metrics, 1, 0.0, 
+            float(SparkDelay + SparkCost)),
+        impure get_mutvar(BestParallelisationMutvar, 
+            CurrentBestParallelisation),
+        cbmr_metrics_is_better = compare_best_parallelisation_and_metrics(
+            CurrentBestParallelisation, MetricsComplete), 
+
+        NumConjuncts = NumConjuncts0 + 1,
+        !Parallelisation ^ ip_par_exec_overlap := Overlap,
+        !Parallelisation ^ ip_par_exec_metrics := Metrics,
+        !Parallelisation ^ ip_par_conjs := Conjs,
+        !Parallelisation ^ ip_is_outermost_conj := is_not_outermost_conjunct,
+
+        impure find_best_parallelisation_2(Info, DependencyMaps, 
+            BestParallelisationMutvar, 
+            NumConjuncts, ProductionsMap, !Goals, !Parallelisation)
+    ;
+        FindCostlyCallResult = costly_call_not_found
+    ).
+
+:- type compare_best_and_metrics_result
+    --->    cbmr_metrics_is_better
+    ;       cbmr_metrics_is_not_better.
+
+    % Compare the best parallelisation with the current parallelisation.
+    %
+:- func compare_best_parallelisation_and_metrics(best_parallelisation,
+        parallel_exec_metrics) = compare_best_and_metrics_result.
+
+compare_best_parallelisation_and_metrics(BestParallelisation, Metrics) = 
+        Result :-
+    (
+        BestParallelisation = bp_no_best_parallelisation,
+        Result = cbmr_metrics_is_better
+    ;
+        BestParallelisation = bp_parallel_execution(_, _, _, _, BestMetrics),
+        BestParTime = parallel_exec_metrics_get_par_time(BestMetrics),
+        ParTime = parallel_exec_metrics_get_par_time(Metrics),
+        % TODO: Add other comparisons for tie breaking or a better optimisation
+        % formula.
+        ( BestParTime > ParTime ->
+            Result = cbmr_metrics_is_better
+        ;
+            Result = cbmr_metrics_is_not_better
+        )
+    ).
 
     % This datastructure represents the execution of dependant conjuncts, it
     % tracks which variabes are produced and consumed.
@@ -1706,6 +1766,13 @@ calculate_parallel_cost(Info, Conjs, Par
     % calculate_parallel_cost(Info, Conjunctions, IsOutermostConjunct,
     %   ParallelExecMetrics, ParallelExecOverlap, ProductionsMap, NumConjuncts).
     %
+    % Analyse the parallel conjuncts and determine their likely performance.
+    % This code performs one step of the computation the steps are linked
+    % together by find_best_parallelisation.
+    %
+    % This is the new parallel execution overlap algorithm, it is general and
+    % therefore we also use is for independent conjunctions.
+    %
     % + CallerVars is the set of vars for which our caller wants us to build a
     %   productions map for.
     %
@@ -1715,21 +1782,15 @@ calculate_parallel_cost(Info, Conjs, Par
     % + ProductionsMap: A productions map that covers the production of
     %   CallerVars.
     %
-:- pred calculate_parallel_cost_2(implicit_parallelism_info::in, 
-    cord(seq_conj(pard_goal_detail))::in, is_outermost_conjunct::in,
-    parallel_exec_metrics_incomplete::out, parallel_execution_overlap::out,
-    map(var_rep, float)::out, int::out) is det.
-
-calculate_parallel_cost_2(Info, Conjs0, IsOutermostConjunct,
-        ParallelExecMetrics, ParallelExecOverlap, ProductionsMap, 
-        NumConjuncts) :-
-    ( cord.split_last(Conjs0, Conjs, Conj) ->
-        some [!ProductionsMap] (
-            calculate_parallel_cost_2(Info, Conjs, is_not_outermost_conjunct,
-                ParallelExecMetrics0, ParallelExecOverlap0, !:ProductionsMap,
-                NumConjuncts0), 
-           
-            seq_conj(Goals) = Conj,
+:- pred calculate_parallel_cost_step(implicit_parallelism_info::in,
+    is_outermost_conjunct::in, int::in, cord(pard_goal_detail)::in, 
+    map(var_rep, float)::in, map(var_rep, float)::out,
+    parallel_exec_metrics_incomplete::in, parallel_exec_metrics_incomplete::out,
+    parallel_execution_overlap::in, parallel_execution_overlap::out) is det.
+
+calculate_parallel_cost_step(Info, IsOutermostConjunct, NumConjuncts, Conj,
+        !ProductionsMap, !Metrics, !Overlap) :-
+    Goals = list(Conj),
             foldl(pardgoal_calc_cost, Goals, 0.0, CostB),
             foldl(pardgoal_consumed_vars_accum, Goals, set.init,
                 RightConsumedVars),
@@ -1741,7 +1802,7 @@ calculate_parallel_cost_2(Info, Conjs0, 
             % the prevous conjunct.  Which in turn may have been sparked by an
             % earlier conjunct.
             SparkDelay = Info ^ ipi_opts ^ cpc_sparking_delay, 
-            StartTime0 = (NumConjuncts0 * SparkDelay),
+    StartTime0 = (NumConjuncts * SparkDelay),
 
             % If there are conjuncts after this conjunct we will have the
             % additional cost of sparking them.
@@ -1761,16 +1822,16 @@ calculate_parallel_cost_2(Info, Conjs0, 
             reverse(ConsumptionsList0, ConsumptionsList),
 
             % Determine how the parallel conjuncts overlap.
-            foldl4(calculate_dependant_parallel_cost_2(Info, !.ProductionsMap), 
+    foldl5(calculate_dependant_parallel_cost_2(Info, !.ProductionsMap), 
                 ConsumptionsList, 0.0, LastSeqConsumeTime, 
-                StartTime, LastParConsumeTime, [], RevExecution0, 
-                map.init, ConsumptionsMap),
-            RevExecution = [ (LastParConsumeTime - CostBPar) | RevExecution0 ],
+        StartTime, LastParConsumeTime, StartTime, LastResumeTime, 
+        [], RevExecution0, map.init, ConsumptionsMap),
+    RevExecution = [ (LastResumeTime - CostBPar) | RevExecution0 ],
             reverse(RevExecution, Execution),
 
             CostBPar = LastParConsumeTime + (CostB - LastSeqConsumeTime),
-            ParallelExecMetrics = init_parallel_exec_metrics_incomplete(
-                ParallelExecMetrics0, CostB, CostBPar),
+    !:Metrics = 
+        init_parallel_exec_metrics_incomplete(!.Metrics, CostB, CostBPar),
             
             % Build the productions map for our parent.  This map contains all
             % the variables produced by this code, not just that are used for
@@ -1780,31 +1841,19 @@ calculate_parallel_cost_2(Info, Conjs0, 
 
             DepConjExec = dependant_conjunct_execution(Execution,
                 !.ProductionsMap, ConsumptionsMap),
-            ParallelExecOverlap = peo_conjunction(ParallelExecOverlap0, 
-                DepConjExec, Vars),
-
-            ProductionsMap = !.ProductionsMap,
-            NumConjuncts = NumConjuncts0 + 1
-        )
-    ;
-        ParallelExecMetrics = init_empty_parallel_exec_metrics,
-        ParallelExecOverlap = peo_empty_conjunct,
-        ProductionsMap = map.init,
-        NumConjuncts = 0
-    ).
-
+    !:Overlap = peo_conjunction(!.Overlap, DepConjExec, Vars).
 
     % The main loop of the parallel overlap analysis.
     %
 :- pred calculate_dependant_parallel_cost_2(implicit_parallelism_info::in, 
     map(var_rep, float)::in, pair(var_rep, float)::in, float::in, float::out,
-    float::in, float::out,
+    float::in, float::out, float::in, float::out,
     assoc_list(float, float)::in, assoc_list(float, float)::out, 
     map(var_rep, float)::in, map(var_rep, float)::out) is det.
 
 calculate_dependant_parallel_cost_2(Info, ProductionsMap, Var - SeqConsTime,
-        !PrevSeqConsumeTime, !PrevParConsumeTime, !RevExecution,
-        !ConsumptionsMap) :-
+        !PrevSeqConsumeTime, !PrevParConsumeTime, !ResumeTime,
+        !RevExecution, !ConsumptionsMap) :-
     FutureSyncTime = float(Info ^ ipi_opts ^ cpc_locking_cost),
     map.lookup(ProductionsMap, Var, ProdTime),
 
@@ -1831,7 +1880,8 @@ calculate_dependant_parallel_cost_2(Info
         ProdTime > ParConsTimeNotBlocked 
     ->
         !:RevExecution = 
-            [ (!.PrevParConsumeTime - ParConsTimeNotBlocked) | !.RevExecution ] 
+            [ (!.ResumeTime - ParConsTimeNotBlocked) | !.RevExecution ],
+        !:ResumeTime = ParConsTime
     ;
         true
     ),
@@ -1841,6 +1891,17 @@ calculate_dependant_parallel_cost_2(Info
 
     svmap.det_insert(Var, ParConsTime, !ConsumptionsMap).
 
+:- pred par_conj_overlap_is_dependant(parallel_execution_overlap::in, 
+    conjuncts_are_dependant::out) is det.
+
+par_conj_overlap_is_dependant(peo_empty_conjunct, conjuncts_are_independent).
+par_conj_overlap_is_dependant(peo_conjunction(Left, _, VarSet), IsDependent) :-
+    ( set.empty(VarSet) ->
+        par_conj_overlap_is_dependant(Left, IsDependent)
+    ;
+        IsDependent = conjuncts_are_dependant
+    ).
+
 :- pred pg_get_conj_num(pard_goal_detail::in, int::out) is det.
 
 pg_get_conj_num(PG, PG ^ pgd_conj_num).
@@ -2778,7 +2839,7 @@ transitive_map_insert(K, V, !Map) :-
 create_candidate_parallel_conj_report(Proc - CandidateParConjunction, Report) :-
     print_proc_label_to_string(Proc, ProcString),
     CandidateParConjunction = candidate_par_conjunction(GoalPath,
-        PartNum, IsDependant, Conjs, ParExecMetrics),
+        PartNum, IsDependant, GoalsBefore, Conjs, GoalsAfter, ParExecMetrics),
     NumCalls = parallel_exec_metrics_get_num_calls(ParExecMetrics),
     (
         IsDependant = conjuncts_are_independent,
@@ -2796,7 +2857,7 @@ create_candidate_parallel_conj_report(Pr
     FutureDeadTime =
         parallel_exec_metrics_get_future_dead_time(ParExecMetrics),
     TotalDeadTime = parallel_exec_metrics_get_total_dead_time(ParExecMetrics),
-    format("\n      %s\n" ++
+    format("      %s\n" ++
            "      Path and Partition Num: %s, %d\n" ++
            "      Dependant: %s\n" ++
            "      NumCalls: %d\n" ++
@@ -2806,22 +2867,25 @@ create_candidate_parallel_conj_report(Pr
            "      Time saving: %f\n" ++
            "      First conj dead time: %f\n" ++
            "      Future dead time: %f\n" ++
-           "      Total dead time: %f", 
+           "      Total dead time: %f\n\n", 
         [s(ProcString), s(GoalPath), i(PartNum), s(DependanceString),
             i(NumCalls), f(SeqTime), f(ParTime), f(Speedup), f(TimeSaving), 
             f(FirstConjDeadTime), f(FutureDeadTime), f(TotalDeadTime)],
         ReportHeaderStr),
     ReportHeader = singleton(ReportHeaderStr),
 
+    format_sequential_conjuncts(3, GoalsBefore, empty, ReportGoalsBefore),
     format_parallel_conjunction(3, Conjs, ReportParConj), 
+    format_sequential_conjuncts(3, GoalsAfter, empty, ReportGoalsAfter),
 
-    Report = snoc(ReportHeader ++ ReportParConj, "\n").
+    Report = snoc(ReportHeader ++ ReportGoalsBefore ++ ReportParConj ++ 
+        ReportGoalsAfter, "\n").
 
 :- pred format_parallel_conjunction(int::in, 
     list(seq_conj(pard_goal))::in, cord(string)::out) is det.
 
 format_parallel_conjunction(Indent, Conjs, Report) :-
-    IndentStr = nl_indent(Indent),
+    IndentStr = indent(Indent),
     Report0 = IndentStr ++ singleton("(\n"),
     format_parallel_conjuncts(Indent, Conjs, Report0, Report). 
 
@@ -2830,7 +2894,7 @@ format_parallel_conjunction(Indent, Conj
 
 format_parallel_conjuncts(Indent, [], !Report) :-
     IndentStr = indent(Indent),
-    !:Report = snoc(!.Report ++ IndentStr, ")").
+    !:Report = snoc(!.Report ++ IndentStr, ")\n").
 format_parallel_conjuncts(Indent, [ Conj | Conjs ], !Report) :-
     format_sequential_conjunction(Indent + 1, Conj, ConjReport),
     !:Report = !.Report ++ ConjReport,
@@ -2922,5 +2986,27 @@ format_callee(named_callee(Module, Proc)
     format("%s.%s", [s(Module), s(Proc)])).
 
 %-----------------------------------------------------------------------------%
+
+:- pred cord_split(cord(T)::in, cord(T)::out, cord(T)::out) is multi.
+
+cord_split(Cord, A, B) :-
+    ( cord.head_tail(Cord, Head, Tail) ->
+        (
+            % Put the split before Cord,
+            A = cord.empty,
+            B = Cord
+        ;
+            % Put the split within or after Cord.
+            cord_split(Tail, TailA, B),
+            A = cons(Head, TailA)
+        )
+    ;
+        % An empty cord can only be split one way (since we say that the split
+        % may happen before the cord).
+        A = cord.empty,
+        B = cord.empty
+    ).
+
+%-----------------------------------------------------------------------------%
 :- end_module mdprof_fb.automatic_parallelism.
 %-----------------------------------------------------------------------------%
Index: deep_profiler/mdprof_feedback.m
===================================================================
RCS file: /home/mercury1/repository/mercury/deep_profiler/mdprof_feedback.m,v
retrieving revision 1.22
diff -u -p -b -r1.22 mdprof_feedback.m
--- deep_profiler/mdprof_feedback.m	13 May 2010 02:27:38 -0000	1.22
+++ deep_profiler/mdprof_feedback.m	8 Jun 2010 07:33:25 -0000
@@ -191,7 +191,7 @@ create_feedback_report(feedback_data_can
             "    Sparking delay: %d\n" ++
             "    Locking cost: %d\n" ++
             "    Number of Parallel Conjunctions: %d\n" ++
-            "    Parallel Conjunctions:\n",
+            "    Parallel Conjunctions:\n\n",
         [f(DesiredParallelism), i(SparkingCost), i(SparkingDelay),
          i(LockingCost), i(NumConjs)])),
     map(create_candidate_parallel_conj_report, Conjs, ReportConjs),
@@ -513,7 +513,14 @@ check_options(Options0, RequestedFeedbac
             CPCCallSiteThreshold),
         lookup_bool_option(Options,
             implicit_parallelism_dependant_conjunctions,
-            ParalleliseDepConjs),
+            ParalleliseDepConjsBool),
+        (
+            ParalleliseDepConjsBool = yes,
+            ParalleliseDepConjs = parallelise_dep_conjs
+        ;
+            ParalleliseDepConjsBool = no,
+            ParalleliseDepConjs = do_not_parallelise_dep_conjs
+        ),
         CandidateParallelConjunctionsOpts =
             candidate_parallel_conjunctions_opts(DesiredParallelism, 
                 SparkingCost,
Index: mdbcomp/feedback.m
===================================================================
RCS file: /home/mercury1/repository/mercury/mdbcomp/feedback.m,v
retrieving revision 1.10
diff -u -p -b -r1.10 feedback.m
--- mdbcomp/feedback.m	13 May 2010 02:27:38 -0000	1.10
+++ mdbcomp/feedback.m	8 Jun 2010 12:53:16 -0000
@@ -119,6 +119,8 @@
 
                 cpc_is_dependant        :: conjuncts_are_dependant,
 
+                cpc_goals_before        :: list(GoalType),
+
                 cpc_conjs               :: list(seq_conj(GoalType)),
                     % A list of parallel conjuncts, each is a sequential
                     % conjunction of inner goals.  All inner goals that are
@@ -131,6 +133,8 @@
                     % algorithms to construct the same parallel conjunction
                     % from the same program representation/HLDS structure.
 
+                cpc_goals_after         :: list(GoalType),
+
                 cpc_par_exec_metrics    :: parallel_exec_metrics
             ).
 
@@ -202,20 +206,6 @@
     %
 :- type parallel_exec_metrics_incomplete.
     
-    % ParExecMetrics = init_parallel_execution_metrics(NumCalls, 
-    %   CostA, CostBSeq, CostBPar).
-    %
-    % CostA is the cost of the first conjunct in a conjunction.  CostBSeq is
-    % the cost of the second conjunct if this is a normal conjunction.
-    % CostBPar is the cost of the second conjunct if this is a parallel
-    % conjunction.
-    %
-    % This function should be used when building metrics for parallel
-    % conjunctions of exactly two conjuncts.
-    %
-:- func init_parallel_exec_metrics(int, float, float, float) =
-    parallel_exec_metrics. 
-    
     % ParMetrics = init_parallel_exec_metrics_incomplete(PartMetricsA,
     %   CostBSeq, CostBPar) 
     %
@@ -233,20 +223,26 @@
 :- func init_parallel_exec_metrics_incomplete(parallel_exec_metrics_incomplete,
     float, float) = parallel_exec_metrics_incomplete.
 
-    % StartMetrics = init_empty_parallel_exec_metrics.
+    % StartMetrics = init_empty_parallel_exec_metrics(CostBefore).
     %
     % Use this function to start with an empty set of metrics for an empty
     % conjunction.  Then use init_parallel_exec_metrics_incomplete to continue
     % adding conjuncts on the right.
     %
-:- func init_empty_parallel_exec_metrics = parallel_exec_metrics_incomplete.
+:- func init_empty_parallel_exec_metrics(float) = 
+    parallel_exec_metrics_incomplete.
 
-    % Metrics = finalise_parallel_exec_metrics(IncompleteMetrics, NumCalls).
+    % Metrics = finalise_parallel_exec_metrics(IncompleteMetrics, NumCalls,
+    %   CostAfterConj, RightConjDelay).:w
     %
     % Make the metrics structure complete.
     %
-:- func finalise_parallel_exec_metrics(parallel_exec_metrics_incomplete, int) =
-    parallel_exec_metrics.
+    % RightConjDelay is the delay before the conjunct to the right of & will
+    % begin executing.  & is considered to be right-associative since that's
+    % how sparks are sparked.
+    %
+:- func finalise_parallel_exec_metrics(parallel_exec_metrics_incomplete, 
+    int, float, float) = parallel_exec_metrics.
 
 :- func parallel_exec_metrics_get_num_calls(parallel_exec_metrics) = int.
 
@@ -573,7 +569,7 @@ read_program_name(Stream, _, Result, !IO
     io.read_line_as_string(Stream, IOResultLine, !IO),
     (
         IOResultLine = ok(String),
-        Result = ok(String)
+        Result = ok(strip(String))
     ;
         IOResultLine = eof,
         Result = error(unexpected_eof)
@@ -665,7 +661,7 @@ read_error_message_string(File, Error, M
         Error = incorrect_program_name(Expected, Got),
         MessagePart =
             "Program name didn't match, is this the right feedback file?\n"
-            ++ format("Expected: %s Got: %s", [s(Expected), s(Got)])
+            ++ format("Expected: '%s' Got: '%s'", [s(Expected), s(Got)])
     ),
     string.format("%s: %s\n", [s(File), s(MessagePart)], Message).
 
@@ -738,7 +734,7 @@ feedback_first_line = "Mercury Compiler 
 
 :- func feedback_version = string.
 
-feedback_version = "7".
+feedback_version = "8".
 
 %-----------------------------------------------------------------------------%
 %
@@ -748,9 +744,13 @@ feedback_version = "7".
 %
 
 convert_candidate_par_conjunction(Conv, CPC0, CPC) :-
-    Conjs0 = CPC0 ^ cpc_conjs,
+    CPC0 = candidate_par_conjunction(GoalPath, PartNum, IsDependent, 
+        GoalsBefore0, Conjs0, GoalsAfter0, Metrics),
     map(convert_seq_conj(Conv), Conjs0, Conjs),
-    CPC = CPC0 ^ cpc_conjs := Conjs.
+    map(Conv, GoalsBefore0, GoalsBefore),
+    map(Conv, GoalsAfter0, GoalsAfter),
+    CPC = candidate_par_conjunction(GoalPath, PartNum, IsDependent, 
+        GoalsBefore, Conjs, GoalsAfter, Metrics).
 
 convert_seq_conj(Conv, seq_conj(Conjs0), seq_conj(Conjs)) :-
     map(Conv, Conjs0, Conjs).
@@ -759,51 +759,70 @@ convert_seq_conj(Conv, seq_conj(Conjs0),
 
 :- type parallel_exec_metrics
     --->    parallel_exec_metrics(
-                inner_metrics   :: parallel_exec_metrics_incomplete,
-                num_calls       :: int
+                pem_inner_metrics       :: parallel_exec_metrics_incomplete,
+                pem_num_calls           :: int,
+                pem_time_before_conj    :: float,
+                pem_time_after_conj     :: float,
+                pem_right_conj_delay    :: float
+                    % The delay before a conjunct to the right of & begins
+                    % executing.
             ).
 
 :- type parallel_exec_metrics_incomplete
-    --->    pem_initial
+    --->    pem_initial(
+                pemi_time_before_conj    :: float
+            )
     ;       pem_additional(
-                cost_left       :: parallel_exec_metrics_incomplete,
-                    % The cost of the left conjunct (that may be a conjunction),
+                pemi_time_left           :: parallel_exec_metrics_incomplete,
+                    % The time of the left conjunct (that may be a conjunction),
 
-                cost_right_seq  :: float,
-                    % The cost of the right conjunct if it is running after
+                pemi_time_right_seq      :: float,
+                    % The time of the right conjunct if it is running after
                     % the left in normal sequential execution.
 
-                cost_right_par   :: float
-                    % THe cost of the right conjunct if it is running in
+                pemi_time_right_par      :: float
+                    % The time of the right conjunct if it is running in
                     % parallel with the left conjunct.  It may have to stop and
-                    % wait for variables to be produced; therefore this cost is
-                    % different to cost_right_seq.  This cost also includes
+                    % wait for variables to be produced; therefore this time is
+                    % different to time_right_seq.  This time also includes
                     % parallel execution overheads and delays.
             ).
 
-init_parallel_exec_metrics(NumCalls, TimeA, TimeBSeq, TimeBPar) = Metrics :-
-    Metrics0 = init_empty_parallel_exec_metrics,
-    Metrics1 = init_parallel_exec_metrics_incomplete(Metrics0, TimeA, TimeA),
-    Metrics2 = 
-        init_parallel_exec_metrics_incomplete(Metrics1, TimeBSeq, TimeBPar),
-    Metrics = finalise_parallel_exec_metrics(Metrics2, NumCalls).
-
 init_parallel_exec_metrics_incomplete(MetricsA, TimeBSeq, TimeBPar) = 
     pem_additional(MetricsA, TimeBSeq, TimeBPar).
 
-init_empty_parallel_exec_metrics = pem_initial.
-
-finalise_parallel_exec_metrics(IncompleteMetrics, NumCalls) = 
-    parallel_exec_metrics(IncompleteMetrics, NumCalls).
+init_empty_parallel_exec_metrics(TimeBefore) = pem_initial(TimeBefore).
 
-parallel_exec_metrics_get_num_calls(parallel_exec_metrics(_, NumCalls)) = 
-    NumCalls.
+finalise_parallel_exec_metrics(IncompleteMetrics, NumCalls, TimeAfter,
+        RightConjDelay) 
+    =
+        parallel_exec_metrics(IncompleteMetrics, NumCalls, TimeBefore,
+        TimeAfter, RightConjDelay) :-
+    TimeBefore = 
+        parallel_exec_metrics_incomp_get_time_before(IncompleteMetrics).
 
-parallel_exec_metrics_get_par_time(parallel_exec_metrics(Inner, _)) =
-    parallel_exec_metrics_incomp_get_par_time(Inner).
+:- func parallel_exec_metrics_incomp_get_time_before(
+    parallel_exec_metrics_incomplete) = float.
 
-parallel_exec_metrics_get_seq_time(parallel_exec_metrics(Inner, _)) =
-    parallel_exec_metrics_incomp_get_seq_time(Inner).
+parallel_exec_metrics_incomp_get_time_before(pem_initial(Time)) = Time.
+parallel_exec_metrics_incomp_get_time_before(
+        pem_additional(Left, _, _)) = Time :-
+    Time = parallel_exec_metrics_incomp_get_time_before(Left).
+
+parallel_exec_metrics_get_num_calls(PEM) = NumCalls :-
+    NumCalls = PEM ^ pem_num_calls.
+
+parallel_exec_metrics_get_par_time(PEM) = Time :-
+    Inner = PEM ^ pem_inner_metrics,
+    InnerTime = parallel_exec_metrics_incomp_get_par_time(Inner),
+    BeforeAndAfterTime = PEM ^ pem_time_before_conj + PEM ^ pem_time_after_conj,
+    Time = InnerTime + BeforeAndAfterTime.
+
+parallel_exec_metrics_get_seq_time(PEM) = Time :- 
+    Inner = PEM ^ pem_inner_metrics,
+    InnerTime = parallel_exec_metrics_incomp_get_seq_time(Inner),
+    BeforeAndAfterTime = PEM ^ pem_time_before_conj + PEM ^ pem_time_after_conj,
+    Time = InnerTime + BeforeAndAfterTime.
 
 parallel_exec_metrics_get_speedup(Metrics) = SeqTime / ParTime :-
     SeqTime = parallel_exec_metrics_get_seq_time(Metrics),
@@ -814,7 +833,7 @@ parallel_exec_metrics_get_speedup(Metric
 :- func parallel_exec_metrics_incomp_get_par_time(
     parallel_exec_metrics_incomplete) = float.
 
-parallel_exec_metrics_incomp_get_par_time(pem_initial) = 0.0.
+parallel_exec_metrics_incomp_get_par_time(pem_initial(_)) = 0.0.
 parallel_exec_metrics_incomp_get_par_time(pem_additional(MetricsLeft, _, TimeRight)) 
         = Time :-
     TimeLeft = parallel_exec_metrics_incomp_get_par_time(MetricsLeft),
@@ -825,7 +844,7 @@ parallel_exec_metrics_incomp_get_par_tim
 :- func parallel_exec_metrics_incomp_get_seq_time(
     parallel_exec_metrics_incomplete) = float.
 
-parallel_exec_metrics_incomp_get_seq_time(pem_initial) = 0.0.
+parallel_exec_metrics_incomp_get_seq_time(pem_initial(_)) = 0.0.
 parallel_exec_metrics_incomp_get_seq_time(pem_additional(MetricsLeft, TimeRight, _)) 
         = Time :-
     TimeLeft = parallel_exec_metrics_incomp_get_seq_time(MetricsLeft),
@@ -836,18 +855,18 @@ parallel_exec_metrics_get_time_saving(Me
     ParTime = parallel_exec_metrics_get_par_time(Metrics).
 
 parallel_exec_metrics_get_first_conj_dead_time(Metrics) = DeadTime :-
-    parallel_exec_metrics(Inner, _) = Metrics,
+    Inner = Metrics ^ pem_inner_metrics,
     FirstConjTime = pem_get_first_conj_time(Inner),
     MaxConjTime = pem_get_max_conj_time(Inner, 0.0),
     DeadTime = MaxConjTime - FirstConjTime.
 
 :- func pem_get_first_conj_time(parallel_exec_metrics_incomplete) = float.
 
-pem_get_first_conj_time(pem_initial) = _ :- 
+pem_get_first_conj_time(pem_initial(_)) = _ :- 
     error("pem_get_first_conj_time: Empty conjunction").
 pem_get_first_conj_time(pem_additional(Left, _RightSeq, RightPar)) = Time :-
     (
-        Left = pem_initial,
+        Left = pem_initial(_),
         Time = RightPar
     ;
         Left = pem_additional(_, _, _),
@@ -856,22 +875,33 @@ pem_get_first_conj_time(pem_additional(L
 
 :- func pem_get_max_conj_time(parallel_exec_metrics_incomplete, float) = float.
 
-pem_get_max_conj_time(pem_initial, Max) = Max.
+pem_get_max_conj_time(pem_initial(_), Max) = Max.
 pem_get_max_conj_time(pem_additional(Left, _, Par), Max0) = Max :-
     Max1 = max(Par, Max0),
     Max = pem_get_max_conj_time(Left, Max1).
 
 parallel_exec_metrics_get_future_dead_time(Metrics) = DeadTime :-
-    parallel_exec_metrics(Inner, _) = Metrics,
-    DeadTime = pem_get_future_dead_time(Inner).
+    Inner = Metrics ^ pem_inner_metrics,
+    RightConjDelay = Metrics ^ pem_right_conj_delay,
+    DeadTime = pem_get_future_dead_time(Inner, RightConjDelay).
 
-:- func pem_get_future_dead_time(parallel_exec_metrics_incomplete) = float.
+:- func pem_get_future_dead_time(parallel_exec_metrics_incomplete, float) 
+    = float.
 
-pem_get_future_dead_time(pem_initial) = 0.0.
-pem_get_future_dead_time(pem_additional(Left, Seq, Par)) = DeadTime :-
+pem_get_future_dead_time(pem_initial(_), _) = 0.0.
+pem_get_future_dead_time(pem_additional(Left, Seq, Par), Delay) = DeadTime :-
     DeadTime = ThisDeadTime + LeftDeadTime,
-    ThisDeadTime = Par - Seq,
-    LeftDeadTime = pem_get_future_dead_time(Left).
+    % Only use the delay if this conjunction contains some code in it's left
+    % conjunct.
+    (
+        Left = pem_initial(_),
+        ThisDelay = 0.0
+    ;
+        Left = pem_additional(_, _, _),
+        ThisDelay = Delay
+    ),
+    ThisDeadTime = Par - Seq - ThisDelay,
+    LeftDeadTime = pem_get_future_dead_time(Left, Delay).
 
 parallel_exec_metrics_get_total_dead_time(Metrics) = DeadTime :-
     DeadTime = FirstConjDeadTime + FutureDeadTime,
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 489 bytes
Desc: Digital signature
URL: <http://lists.mercurylang.org/archives/reviews/attachments/20100608/6417fb6f/attachment.sig>


More information about the reviews mailing list