Branches: main

         New module which reads the profiling feedback file and decides where
         parallelism should be introduced.
         Move find_shared_variables into the interface (needed by implicit_parallelism.m).
         Add the call to apply implicit parallelism.
         Add implicit-parallelism and feedback-file options.
         Add transform_paralellism in the imported modules.
         Add "all" option to dump everything out of the Deep.data file.
         Rename distribution to measure.
         Add handling of dump_stages and dump_options.
         Correct the way the list of CSS is built (elems were put in the wrong order).

  :- module transform_hlds.dep_par_conj.
  :- interface.

+:- import_module hlds.hlds_goal.
  :- import_module hlds.hlds_module.
+:- import_module hlds.instmap.
+:- import_module parse_tree.prog_data.

  :- import_module io.
+:- import_module set.


  :- pred dependent_par_conj(module_info::in, module_info::out, io::di, io::uo)
      is det.

+    % Exported because it is used by transform_hlds.implicit_parallelism.
+:- func find_shared_variables(module_info, instmap, hlds_goals)
+    = set(prog_var).

  :- import_module check_hlds.mode_util.
  :- import_module check_hlds.purity.
  :- import_module hlds.goal_util.
-:- import_module hlds.hlds_goal.
  :- import_module hlds.hlds_pred.
-:- import_module hlds.instmap.
  :- import_module hlds.pred_table.
  :- import_module hlds.quantification.
  :- import_module libs.compiler_util.
  :- import_module libs.globals.
  :- import_module libs.options.
  :- import_module mdbcomp.prim_data.
-:- import_module parse_tree.prog_data.
  :- import_module parse_tree.prog_type.
  :- import_module parse_tree.prog_util.
  :- import_module transform_hlds.dependency_graph.
  :- import_module map.
  :- import_module maybe.
  :- import_module pair.
-:- import_module set.
  :- import_module std_util.
  :- import_module string.
  :- import_module svmap.
@@ -678,9 +682,6 @@
      % XXX this code is probably too complicated.  I think Thomas already had a
      % more elegant way to find the shared variables somewhere, using multisets.
-:- func find_shared_variables(module_info, instmap, hlds_goals)
-    = set(prog_var).
  find_shared_variables(ModuleInfo, InstMap, Goals) = SharedVars :-
      list.map2(get_nonlocals_and_instmaps, Goals, Nonlocals, InstMapDeltas),
      find_shared_variables_2(ModuleInfo, 0, Nonlocals, InstMap, InstMapDeltas,
+% vim: ft=mercury ts=4 sw=4 et
+% Copyright (C) 2006 The University of Melbourne.
+% This file may only be copied under the terms of the GNU General
+% Public License - see the file COPYING in the Mercury distribution.
+% Author: tannier.
+% This module reads the profiling feedback file generated by the mdprof_feedback
+% module and decides where parallelism should be used (implicit parallelism).
+% TODO - Once a call which has to be parallelized is found, search forward AND
+%       backward for the closest goal to be parallelized/parallel conjunction
+%       and determine which side is the best (on the basis of the number of
+%       shared variables).
+:- module transform_hlds.implicit_parallelism.
+:- interface.
+:- import_module hlds.hlds_module.
+:- import_module io.
+:- pred apply_implicit_parallelism_transformation(module_info::in,
+    module_info::out, string::in, io::di, io::uo) is det.
+:- implementation.
+:- import_module check_hlds.inst_match.
+:- import_module check_hlds.mode_util.
+:- import_module hlds.goal_util.
+:- import_module hlds.hlds_goal.
+:- import_module hlds.hlds_pred.
+:- import_module hlds.instmap.
+:- import_module hlds.quantification.
+:- import_module mdbcomp.prim_data.
+:- import_module parse_tree.error_util.
+:- import_module parse_tree.prog_data.
+:- import_module transform_hlds.dep_par_conj.
+:- import_module bool.
+:- import_module char.
+:- import_module counter.
+:- import_module int.
+:- import_module list.
+:- import_module maybe.
+:- import_module pair.
+:- import_module require.
+:- import_module set.
+:- import_module string.
+    % Represent a call site static (CSS) which has to be parallelized.
+    %
+    % caller:      compared against the raw procedure id string built by
+    %              pred_proc_id_to_raw_id for the procedure being processed.
+    % slot_number: compared against the slot numbers handed out by the
+    %              per-procedure counter while traversing the procedure body.
+    % kind:        compared against the kind strings produced by
+    %              get_call_kind_and_callee ("normal_call", "special_call",
+    %              "higher_order_call", "method_call").
+    % callee:      compared against the raw id of the called procedure; 
+    %              get_call_kind_and_callee uses "" for higher order and
+    %              method calls.
+    %
+:- type css_to_be_parallelized
+    --->    css_to_be_parallelized(
+                caller      :: string,
+                slot_number :: int,
+                kind        :: string,
+                callee      :: string
+            ).
+    % Entry point of the transformation: parse the feedback file and, when
+    % parsing succeeds, process every predicate of the module with the list
+    % of call sites read from it. When parsing fails, the error message is
+    % written to stderr and the module is left untouched.
+    %
+apply_implicit_parallelism_transformation(!ModuleInfo, FeedbackFile, !IO) :-
+    parse_feedback_file(FeedbackFile, ParseResult, !IO),
+    (
+        ParseResult = ok(CSSList),
+        module_info_predids(!.ModuleInfo, AllPredIds),
+        process_preds_for_implicit_parallelism(AllPredIds, CSSList,
+            !ModuleInfo)
+    ;
+        ParseResult = error(ErrorMsg),
+        io.stderr_stream(StdErr, !IO),
+        io.write_string(StdErr, ErrorMsg ++ "\n", !IO)
+    ).
+    % Apply the implicit parallelism transformation to each predicate in the
+    % list, in order: for every predicate, its non-imported procedures are
+    % handed to process_procs_for_implicit_parallelism.
+    %
+:- pred process_preds_for_implicit_parallelism(list(pred_id)::in,
+    list(css_to_be_parallelized)::in, module_info::in, module_info::out)
+    is det.
+process_preds_for_implicit_parallelism(PredIds, CSSListToBeParallelized,
+        !ModuleInfo) :-
+    ProcessPred = (pred(PredId::in, ModuleInfo0::in, ModuleInfo::out) is det :-
+        module_info_pred_info(ModuleInfo0, PredId, PredInfo),
+        ProcIds = pred_info_non_imported_procids(PredInfo),
+        process_procs_for_implicit_parallelism(PredId, ProcIds,
+            CSSListToBeParallelized, ModuleInfo0, ModuleInfo)
+    ),
+    list.foldl(ProcessPred, PredIds, !ModuleInfo).
+    % Apply the implicit parallelism transformation to each listed procedure
+    % of the predicate PredId.
+    %
+    % A procedure for which the feedback list contains no call site is left
+    % untouched. Otherwise its body is processed, the predicate is marked
+    % with marker_may_have_parallel_conj, and the procedure is requantified
+    % and has its instmap deltas recomputed before being stored back.
+    %
+:- pred process_procs_for_implicit_parallelism(pred_id::in,
+    list(proc_id)::in, list(css_to_be_parallelized)::in,
+    module_info::in, module_info::out) is det.
+process_procs_for_implicit_parallelism(_PredId, [],
+        _CSSListToBeParallelized, !ModuleInfo).
+process_procs_for_implicit_parallelism(PredId, [ProcId | ProcIds],
+        CSSListToBeParallelized, !ModuleInfo) :-
+    module_info_pred_proc_info(!.ModuleInfo, PredId, ProcId,
+        PredInfo0, ProcInfo0),
+    pred_proc_id_to_raw_id(PredInfo0, ProcId, CallerRawId),
+    get_callees_feedback(CallerRawId, CSSListToBeParallelized, [],
+        CalleesList),
+    (
+        % No calls to be parallelized in this procedure.
+        CalleesList = []
+    ;
+        CalleesList = [_ | _],
+        % Slot numbers are counted from 0 within each procedure.
+        SlotCounter0 = counter.init(0),
+        proc_info_get_goal(ProcInfo0, Body0),
+        process_goal_for_implicit_parallelism(Body0, Body, ProcInfo0,
+            !ModuleInfo, no, _, 0, _, CalleesList, _, SlotCounter0, _),
+        proc_info_set_goal(Body, ProcInfo0, ProcInfo1),
+        requantify_proc(ProcInfo1, ProcInfo2),
+        recompute_instmap_delta_proc(no, ProcInfo2, ProcInfo, !ModuleInfo),
+        pred_info_get_markers(PredInfo0, Markers0),
+        add_marker(marker_may_have_parallel_conj, Markers0, Markers),
+        pred_info_set_markers(Markers, PredInfo0, PredInfo1),
+        pred_info_set_proc_info(ProcId, ProcInfo, PredInfo1, PredInfo),
+        module_info_set_pred_info(PredId, PredInfo, !ModuleInfo)
+    ),
+    process_procs_for_implicit_parallelism(PredId, ProcIds,
+        CSSListToBeParallelized, !ModuleInfo).
+    % Collect, from the feedback list, every css_to_be_parallelized whose
+    % caller field equals Caller. Matching entries are pushed onto the front
+    % of the accumulator, so they come out in the reverse of their order in
+    % the input list.
+    %
+:- pred get_callees_feedback(string::in, list(css_to_be_parallelized)::in,
+    list(css_to_be_parallelized)::in, list(css_to_be_parallelized)::out) is det.
+get_callees_feedback(_Caller, [], !ResultAcc).
+get_callees_feedback(Caller, [CSS | CSSs], !ResultAcc) :-
+    CSS = css_to_be_parallelized(CSSCaller, _, _, _),
+    ( Caller = CSSCaller ->
+        !:ResultAcc = [CSS | !.ResultAcc]
+    ;
+        true
+    ),
+    get_callees_feedback(Caller, CSSs, !ResultAcc).
+    % Process a goal for implicit parallelism.
+    %
+    % MaybeConj is the conjunction which contains HLDSGoal (no if the goal is
+    % not directly inside a conjunction). IndexInConj is the 1-based position
+    % of HLDSGoal in that conjunction; callers that are not in a conjunction
+    % pass 0. The css list holds the call sites still to be parallelized and
+    % the counter hands out slot numbers to the calls encountered.
+    %
+    % Atomic goals that are not calls only advance the index. Calls are
+    % delegated to process_call_for_implicit_parallelism. Compound goals have
+    % their sub-goals processed and the rebuilt goal is then passed to
+    % update_conj_and_index.
+    %
+:- pred process_goal_for_implicit_parallelism(hlds_goal::in, hlds_goal::out,
+    proc_info::in, module_info::in, module_info::out,
+    maybe(hlds_goal_expr)::in, maybe(hlds_goal_expr)::out, int ::in, int::out,
+    list(css_to_be_parallelized)::in, list(css_to_be_parallelized)::out,
+    counter::in, counter::out) is det.
+process_goal_for_implicit_parallelism(!HLDSGoal, ProcInfo, !ModuleInfo,
+        !MaybeConj, !IndexInConj, !CalleeListToBeParallelized, !Counter) :-
+    !.HLDSGoal = HLDSGoalExpr0 - HLDSGoalInfo,
+    (
+        HLDSGoalExpr0 = unify(_, _, _, _, _),
+        increment_index_if_in_conj(!.MaybeConj, !IndexInConj)
+    ;
+        HLDSGoalExpr0 = plain_call(_, _, _, _, _, _),
+        process_call_for_implicit_parallelism(!.HLDSGoal, ProcInfo, !ModuleInfo,
+            !IndexInConj, !MaybeConj, !CalleeListToBeParallelized, !Counter)
+        % We deal with the index in the conjunction in
+        % process_call_for_implicit_parallelism.
+    ;
+        HLDSGoalExpr0 = call_foreign_proc(_, _, _, _, _, _, _),
+        process_call_for_implicit_parallelism(!.HLDSGoal, ProcInfo, !ModuleInfo,
+            !IndexInConj, !MaybeConj, !CalleeListToBeParallelized, !Counter)
+    ;
+        HLDSGoalExpr0 = generic_call(Details, _, _, _),
+        (
+            Details = higher_order(_, _, _, _),
+            process_call_for_implicit_parallelism(!.HLDSGoal, ProcInfo,
+                !ModuleInfo, !IndexInConj, !MaybeConj,
+                !CalleeListToBeParallelized, !Counter)
+        ;
+            Details = class_method(_, _, _, _),
+            process_call_for_implicit_parallelism(!.HLDSGoal, ProcInfo,
+                !ModuleInfo, !IndexInConj, !MaybeConj,
+                !CalleeListToBeParallelized, !Counter)
+        ;
+            % Event calls and casts are not given a slot number; they only
+            % advance the index.
+            Details = event_call(_),
+            increment_index_if_in_conj(!.MaybeConj, !IndexInConj)
+        ;
+            Details = cast(_),
+            increment_index_if_in_conj(!.MaybeConj, !IndexInConj)
+        )
+    ;
+        % No distinction is made between plain conjunctions and parallel
+        % conjunctions. We have to process the parallel conjunction for the
+        % slot number.
+        HLDSGoalExpr0 = conj(_, _),
+        process_conj_for_implicit_parallelism(HLDSGoalExpr0, HLDSGoalExpr, 1,
+            ProcInfo, !ModuleInfo, !CalleeListToBeParallelized, !Counter),
+        % A plain conjunction will never be contained in another plain
+        % conjunction. As for parallel conjunctions, they won't
+        % be modified. Therefore, incrementing the index suffices (no need to
+        % call update_conj_and_index).
+        !:HLDSGoal = HLDSGoalExpr - HLDSGoalInfo,
+        increment_index_if_in_conj(!.MaybeConj, !IndexInConj)
+    ;
+        HLDSGoalExpr0 = disj(HLDSGoals0),
+        process_disj_for_implicit_parallelism(HLDSGoals0, [], HLDSGoals,
+            ProcInfo, !ModuleInfo, !CalleeListToBeParallelized, !Counter),
+        GoalProcessed = disj(HLDSGoals) - HLDSGoalInfo,
+        update_conj_and_index(!MaybeConj, GoalProcessed, !IndexInConj),
+        % If we are not in a conjunction, then we need to return the modified
+        % value of HLDSGoal. If we are in a conjunction, that information is
+        % not read (see process_conj_for_implicit_parallelism).
+        !:HLDSGoal = GoalProcessed
+    ;
+        HLDSGoalExpr0 = switch(Var, CanFail, Cases0),
+        process_switch_cases_for_implicit_parallelism(Cases0, [], Cases,
+            ProcInfo, !ModuleInfo, !CalleeListToBeParallelized, !Counter),
+        GoalProcessed = switch(Var, CanFail, Cases) - HLDSGoalInfo,
+        update_conj_and_index(!MaybeConj, GoalProcessed, !IndexInConj),
+        !:HLDSGoal = GoalProcessed
+    ;
+        % NOTE(review): the negated goal is processed with the enclosing
+        % conjunction and index threaded through, and update_conj_and_index
+        % is then called as well. Confirm that the index is not advanced
+        % twice and that a negated call handed to
+        % process_call_for_implicit_parallelism keeps its negation.
+        HLDSGoalExpr0 = negation(HLDSGoal0),
+        process_goal_for_implicit_parallelism(HLDSGoal0, HLDSGoal, ProcInfo,
+            !ModuleInfo, !MaybeConj, !IndexInConj, !CalleeListToBeParallelized,
+            !Counter),
+        GoalProcessed = negation(HLDSGoal) - HLDSGoalInfo,
+        update_conj_and_index(!MaybeConj, GoalProcessed, !IndexInConj),
+        !:HLDSGoal = GoalProcessed
+    ;
+        % NOTE(review): the goal inside the scope is not traversed, so calls
+        % in it get no slot number. Confirm this matches the numbering used
+        % in the feedback file.
+        HLDSGoalExpr0 = scope(_, _),
+        increment_index_if_in_conj(!.MaybeConj, !IndexInConj)
+    ;
+        HLDSGoalExpr0 = if_then_else(Vars, If0, Then0, Else0),
+        % 0 is the default value when we are not in a conjunction (in this case
+        % an if then else).
+        process_goal_for_implicit_parallelism(If0, If, ProcInfo, !ModuleInfo,
+            no, _, 0, _, !CalleeListToBeParallelized, !Counter),
+        process_goal_for_implicit_parallelism(Then0, Then, ProcInfo, !ModuleInfo
+            , no, _, 0, _, !CalleeListToBeParallelized, !Counter),
+        process_goal_for_implicit_parallelism(Else0, Else, ProcInfo, !ModuleInfo
+            , no, _, 0, _, !CalleeListToBeParallelized, !Counter),
+        GoalProcessed = if_then_else(Vars, If, Then, Else) - HLDSGoalInfo,
+        update_conj_and_index(!MaybeConj, GoalProcessed, !IndexInConj),
+        !:HLDSGoal = GoalProcessed
+    ;
+        HLDSGoalExpr0 = shorthand(_),
+        increment_index_if_in_conj(!.MaybeConj, !IndexInConj)
+    ).
+    % Advance the index by one when the goal is inside a conjunction
+    % (MaybeConj is yes(_)); leave it unchanged otherwise.
+    %
+:- pred increment_index_if_in_conj(maybe(hlds_goal_expr)::in, int::in, int::out)
+    is det.
+increment_index_if_in_conj(no, !IndexInConj).
+increment_index_if_in_conj(yes(_), !IndexInConj) :-
+    !:IndexInConj = !.IndexInConj + 1.
+    % Process a call for implicit parallelism.
+    %
+    % A slot number is always allocated for the call. If the call is directly
+    % inside a plain conjunction and matches an entry of the css list, we look
+    % forward in the conjunction for a second candidate (another call of the
+    % css list or a parallel conjunction) and try to parallelize the two.
+    %
+    % On return, IndexInConj designates the next conjunct to process:
+    % - End when a candidate partner was found (parallelize_calls shortens the
+    %   conjunction by one goal when it succeeds);
+    % - the old index + 1 when the call is inside a conjunction but nothing
+    %   was attempted;
+    % - unchanged when the call is not inside a conjunction.
+    %
+:- pred process_call_for_implicit_parallelism(hlds_goal::in, proc_info::in,
+    module_info::in, module_info::out, int::in, int::out,
+    maybe(hlds_goal_expr)::in, maybe(hlds_goal_expr)::out,
+    list(css_to_be_parallelized)::in, list(css_to_be_parallelized)::out,
+    counter::in, counter::out) is det.
+process_call_for_implicit_parallelism(Call, ProcInfo, !ModuleInfo, !IndexInConj
+        , !MaybeConj, !CalleeListToBeParallelized, !Counter) :-
+    counter.allocate(SlotNumber, !Counter),
+    get_call_kind_and_callee(!.ModuleInfo, Call, Kind, CalleeRawId),
+    ( !.MaybeConj = yes(Conj0), Conj0 = conj(plain_conj, ConjGoals0)
+    ->
+        (is_in_css_list_to_be_parallelized(Kind, SlotNumber, CalleeRawId,
+            !.CalleeListToBeParallelized, [], !:CalleeListToBeParallelized)
+        ->
+            ( build_goals_surrounded_by_calls_to_be_parallelized(ConjGoals0,
+                !.ModuleInfo, [Call], Goals, !.IndexInConj + 1, End, !Counter,
+                !CalleeListToBeParallelized)
+            ->
+                parallelize_calls(Goals, !.IndexInConj, End, Conj0, Conj,
+                    ProcInfo, !ModuleInfo),
+                !:IndexInConj = End,
+                !:MaybeConj = yes(Conj)
+            ;
+                % The next call is not in the feedback file or we've hit a
+                % plain conjunction/disjunction/switch/if then else.
+                !:IndexInConj = !.IndexInConj + 1
+            )
+        ;
+            % Not to be parallelized.
+            !:IndexInConj = !.IndexInConj + 1
+        )
+    ;
+        % Call is not in a conjunction or the call is already in a parallel
+        % conjunction.
+        %
+        % In the second case the index still has to be advanced: otherwise
+        % process_conj_for_implicit_parallelism would be handed the same
+        % index back and would process this very conjunct again, forever.
+        % When we are not in a conjunction the index is left unchanged.
+        increment_index_if_in_conj(!.MaybeConj, !IndexInConj)
+    ).
+    % Return the kind of a call goal and the raw id (the same as in the deep
+    % profiler) of its callee.
+    %
+    % Plain calls give "normal_call" and foreign procedure calls give
+    % "special_call", both with the raw id of the called procedure. Higher
+    % order and class method calls give "higher_order_call" and "method_call"
+    % with an empty callee id. Any other goal aborts with an error.
+    %
+:- pred get_call_kind_and_callee(module_info::in, hlds_goal::in, string::out,
+    string::out) is det.
+get_call_kind_and_callee(ModuleInfo, Call, Kind, CalleeRawId) :-
+    Call = CallExpr - _,
+    ( CallExpr = plain_call(PredId, ProcId, _, _, _, _) ->
+        Kind = "normal_call",
+        module_info_pred_proc_info(ModuleInfo, PredId, ProcId,
+            CalleePredInfo, _),
+        pred_proc_id_to_raw_id(CalleePredInfo, ProcId, CalleeRawId)
+    ; CallExpr = call_foreign_proc(_, PredId, ProcId, _, _, _, _) ->
+        Kind = "special_call",
+        module_info_pred_proc_info(ModuleInfo, PredId, ProcId,
+            CalleePredInfo, _),
+        pred_proc_id_to_raw_id(CalleePredInfo, ProcId, CalleeRawId)
+    ; CallExpr = generic_call(Details, _, _, _) ->
+        (
+            Details = higher_order(_, _, _, _),
+            Kind = "higher_order_call",
+            CalleeRawId = ""
+        ;
+            Details = class_method(_, _, _, _),
+            Kind = "method_call",
+            CalleeRawId = ""
+        ;
+            Details = event_call(_),
+            error("get_call_kind_and_callee:: the call is an event" ++
+                " call")
+        ;
+            Details = cast(_),
+            error("get_call_kind_and_callee: the call is a cast")
+        )
+    ;
+        error("get_call_kind_and_callee: not a call")
+    ).
+    % Build the raw procedure id (the same used in the deep profiler) from a
+    % pred_info and a proc_id. The result has the shape
+    %   <module>.<name>/<original arity>[+1]-<proc id as int>
+    % where "+1" is only present for functions.
+    %
+:- pred pred_proc_id_to_raw_id(pred_info::in, proc_id::in, string::out) is det.
+pred_proc_id_to_raw_id(PredInfo, ProcId, RawId) :-
+    ModuleString = sym_name_to_string(pred_info_module(PredInfo)),
+    Name = pred_info_name(PredInfo),
+    ArityString = string.int_to_string(pred_info_orig_arity(PredInfo)),
+    PredOrFunc = pred_info_is_pred_or_func(PredInfo),
+    ( PredOrFunc = function ->
+        FuncSuffix = "+1"
+    ;
+        FuncSuffix = ""
+    ),
+    ProcString = string.from_int(proc_id_to_int(ProcId)),
+    RawId = string.append_list([ModuleString, ".", Name, "/", ArityString,
+        FuncSuffix, "-", ProcString]).
+    % Succeed if the kind, slot number and callee correspond to a
+    % css_to_be_parallelized in the list given as a parameter; the result is
+    % then the accumulator, followed by the entries skipped before the match,
+    % followed by the entries after it (i.e. the matching entry is removed).
+    % Fail if no entry matches.
+    %
+:- pred is_in_css_list_to_be_parallelized(string::in, int::in, string::in,
+    list(css_to_be_parallelized)::in,
+    list(css_to_be_parallelized)::in, list(css_to_be_parallelized)::out)
+    is semidet.
+is_in_css_list_to_be_parallelized(Kind, SlotNumber, CalleeRawId,
+        [CSS | CSSs], !ResultAcc) :-
+    % There is no clause for the empty list: running out of entries is a
+    % failure.
+    CSS = css_to_be_parallelized(_, CSSSlotNumber, CSSKind, CSSCallee),
+    (
+        % =< because there is not a one to one correspondence with the
+        % source code. New calls might have been added by the previous
+        % stages of the compiler.
+        CSSSlotNumber =< SlotNumber,
+        CSSKind = Kind,
+        CSSCallee = CalleeRawId
+    ->
+        !:ResultAcc = !.ResultAcc ++ CSSs
+    ;
+        !:ResultAcc = !.ResultAcc ++ [CSS],
+        is_in_css_list_to_be_parallelized(Kind, SlotNumber, CalleeRawId,
+            CSSs, !ResultAcc)
+    ).
+    % Starting at position Index of ConjGoals, append goals to the
+    % accumulator until we reach either a call which is in the feedback list
+    % or a parallel conjunction; that goal is appended too and Index is
+    % returned pointing at it.
+    %
+    % Fail if the end of the conjunction is reached first, or if a
+    % disjunction, switch, if-then-else or plain conjunction is met on the
+    % way.
+    %
+    % Every call met on the way is given a slot number, whether or not it is
+    % in the feedback list.
+    %
+    % NOTE(review): is_a_call also accepts a negated call, whereas
+    % get_call_kind_and_callee aborts on anything which is not itself a call.
+    % Confirm that a negated call cannot reach this point.
+    %
+:- pred build_goals_surrounded_by_calls_to_be_parallelized(list(hlds_goal)::in,
+    module_info::in, list(hlds_goal)::in, list(hlds_goal)::out,
+    int::in, int::out, counter::in, counter::out,
+    list(css_to_be_parallelized)::in, list(css_to_be_parallelized)::out)
+    is semidet.
+build_goals_surrounded_by_calls_to_be_parallelized(ConjGoals, ModuleInfo,
+        !ResultAcc, !Index, !Counter, !CalleeListToBeParallelized) :-
+    !.Index =< list.length(ConjGoals),
+    list.index1_det(ConjGoals, !.Index, Goal),
+    Goal = GoalExpr - _,
+    not (
+        GoalExpr = disj(_)
+    ;
+        GoalExpr = switch(_, _, _)
+    ;
+        GoalExpr = if_then_else(_, _, _, _)
+    ;
+        GoalExpr = conj(plain_conj, _)
+    ),
+    % Whatever the goal is from here on, it belongs to the result.
+    !:ResultAcc = !.ResultAcc ++ [Goal],
+    ( is_a_conjunct(Goal, parallel_conj) ->
+        % A parallel conjunction closes the list.
+        true
+    ; is_a_call(Goal) ->
+        counter.allocate(SlotNumber, !Counter),
+        get_call_kind_and_callee(ModuleInfo, Goal, Kind, CalleeRawId),
+        (
+            is_in_css_list_to_be_parallelized(Kind, SlotNumber,
+                CalleeRawId, !.CalleeListToBeParallelized, [],
+                !:CalleeListToBeParallelized)
+        ->
+            % A call of the feedback list closes the list.
+            true
+        ;
+            !:Index = !.Index + 1,
+            build_goals_surrounded_by_calls_to_be_parallelized(
+                ConjGoals, ModuleInfo, !ResultAcc, !Index, !Counter,
+                !CalleeListToBeParallelized)
+        )
+    ;
+        !:Index = !.Index + 1,
+        build_goals_surrounded_by_calls_to_be_parallelized(
+            ConjGoals, ModuleInfo, !ResultAcc, !Index, !Counter,
+            !CalleeListToBeParallelized)
+    ).
+    % Succeed if Goal is a conjunction, returning (or checking) its type
+    % (plain or parallel). Fail otherwise.
+    %
+:- pred is_a_conjunct(hlds_goal::in, conj_type::out) is semidet.
+is_a_conjunct(conj(Type, _) - _, Type).
+    % Succeed if Goal is a call or a call inside a (single) negation.
+    % Fail otherwise.
+    %
+:- pred is_a_call(hlds_goal::in) is semidet.
+is_a_call(GoalExpr - _) :-
+    ( is_a_call_expr(GoalExpr) ->
+        true
+    ;
+        GoalExpr = negation(NegatedExpr - _),
+        is_a_call_expr(NegatedExpr)
+    ).
+    % Succeed if the goal expression is a plain call, a foreign procedure
+    % call, a class method call or a higher order call.
+    %
+:- pred is_a_call_expr(hlds_goal_expr::in) is semidet.
+is_a_call_expr(GoalExpr) :-
+    (
+        GoalExpr = plain_call(_, _, _, _, _, _)
+    ;
+        GoalExpr = call_foreign_proc(_, _, _, _, _, _, _)
+    ;
+        GoalExpr = generic_call(Details, _, _, _),
+        (
+            Details = class_method(_, _, _, _)
+        ;
+            Details = higher_order(_, _, _, _)
+        )
+    ).
+    % Parallelize two calls/a call and a parallel conjunction which might have
+    % goals between them. If these have no dependencies with the first call then
+    % we move them before the first call and parallelize the two calls/call and
+    % parallel conjunction.
+    %
+    % Goals is contained in Conj: it is the run of conjuncts going from
+    % position Start (the first call) to position End (the second call or
+    % the parallel conjunction), both 1-based. Conj must be a plain
+    % conjunction, otherwise we abort.
+    %
+    % Conj is returned unchanged when the parallelization is not worth it or
+    % when some goal in between depends on the first call.
+    %
+:- pred parallelize_calls(list(hlds_goal)::in, int::in, int::in,
+    hlds_goal_expr::in, hlds_goal_expr::out, proc_info::in,
+    module_info::in, module_info::out) is det.
+parallelize_calls(Goals, Start, End, !Conj, ProcInfo, !ModuleInfo) :-
+    ( !.Conj = conj(plain_conj, ConjGoals0) ->
+        ( ConjGoals0 = [FirstGoal, LastGoal] ->
+            % The conjunction consists of exactly the two goals to be
+            % parallelized: the whole conjunction becomes parallel.
+            ( is_worth_parallelizing(FirstGoal, LastGoal, ProcInfo,
+                !.ModuleInfo)
+            ->
+                ( is_a_conjunct(LastGoal, parallel_conj) ->
+                    % The parallel conjunction has to remain flattened.
+                    add_call_to_parallel_conjunction(FirstGoal, LastGoal,
+                        ParallelGoal),
+                    !:Conj = fst(ParallelGoal)
+                ;
+                    !:Conj = conj(parallel_conj, ConjGoals0)
+                )
+            ;
+                % Not worth parallelizing.
+                true
+            )
+        ;
+            % There are more than two goals in the conjunction.
+            list.length(Goals, Length),
+            list.index1_det(Goals, 1, FirstGoal),
+            list.index1_det(Goals, Length, LastGoal),
+            ( is_worth_parallelizing(FirstGoal, LastGoal, ProcInfo,
+                !.ModuleInfo)
+            ->
+                % Goals = [FirstGoal] ++ GoalsInBetween ++ [LastGoal].
+                % The last goal is stripped by position rather than by
+                % value: removing every goal equal to LastGoal would also
+                % drop a goal in between which happens to be identical to
+                % it, and that goal would then be lost from the conjunction.
+                ( Length > 2 ->
+                    list.det_split_list(Length - 2, list.det_tail(Goals),
+                        GoalsInBetween, _)
+                ;
+                    GoalsInBetween = []
+                ),
+                % Check the dependencies of GoalsInBetween with FirstGoal.
+                list.filter(goal_depends_on_goal(FirstGoal),
+                    GoalsInBetween, GoalsFiltered),
+                ( list.is_empty(GoalsFiltered) ->
+                    ( is_a_conjunct(LastGoal, parallel_conj) ->
+                        add_call_to_parallel_conjunction(FirstGoal, LastGoal,
+                            ParallelGoal)
+                    ;
+                        create_parallel_conj(FirstGoal, LastGoal, ParallelGoal)
+                    ),
+                    % The conjuncts before Start are kept as they are.
+                    ( Start = 1 ->
+                        GoalsFront = []
+                    ;
+                        list.det_split_list(Start - 1, ConjGoals0,
+                            GoalsFront, _)
+                    ),
+                    % So are the conjuncts after End.
+                    list.length(ConjGoals0, ConjLength),
+                    ( End = ConjLength ->
+                        GoalsBack = []
+                    ;
+                        list.det_split_list(End, ConjGoals0, _,
+                            GoalsBack)
+                    ),
+                    % New conjunction: front, goals in between (moved before
+                    % the first call), parallel conjunction, back.
+                    list.append(GoalsFront, GoalsInBetween,
+                        GoalsFrontWithBetween),
+                    list.append(GoalsFrontWithBetween, [ParallelGoal],
+                        GoalsWithoutBack),
+                    list.append(GoalsWithoutBack, GoalsBack, ConjGoals),
+                    !:Conj = conj(plain_conj, ConjGoals)
+                ;
+                    % The goals between the two calls/call and parallel
+                    % conjunction can't be moved before the first call.
+                    true
+                )
+            ;
+                % Not worth parallelizing.
+                true
+            )
+        )
+    ;
+        error("parallelize_calls: not in a plain conjunct ")
+    ).
+    % Decide whether parallelizing GoalA with GoalB is worth it, on the basis
+    % of the number of variables they share (as computed by
+    % find_shared_variables with the initial instmap of the procedure).
+    %
+    % - No shared variable: always worth it.
+    % - GoalB is a parallel conjunction: worth it if the number of shared
+    %   variables is smaller than the number of arguments of the call GoalA.
+    % - Two calls: worth it if the number of shared variables does not exceed
+    %   the number of arguments of either call and is strictly smaller than
+    %   at least one of them.
+    %
+    % Succeed if it is worth parallelizing the two goals, fail otherwise.
+    % Abort if, in the last case, one of the goals is not a call.
+    %
+:- pred is_worth_parallelizing(hlds_goal::in, hlds_goal::in, proc_info::in,
+    module_info::in) is semidet.
+is_worth_parallelizing(GoalA, GoalB, ProcInfo, ModuleInfo) :-
+    proc_info_get_initial_instmap(ProcInfo, ModuleInfo, InitialInstMap),
+    SharedVars = find_shared_variables(ModuleInfo, InitialInstMap,
+        [GoalA, GoalB]),
+    NbSharedVars = set.count(SharedVars),
+    ( NbSharedVars = 0 ->
+        % No shared vars between the goals.
+        true
+    ; is_a_conjunct(GoalB, parallel_conj) ->
+        get_number_args(GoalA, NbArgsA),
+        NbSharedVars < NbArgsA
+    ; get_number_args(GoalA, NbArgsA), get_number_args(GoalB, NbArgsB) ->
+        (
+            (
+                NbSharedVars < NbArgsA, NbSharedVars < NbArgsB
+            ;
+                NbSharedVars = NbArgsA, NbSharedVars < NbArgsB
+            ;
+                NbSharedVars < NbArgsA, NbSharedVars = NbArgsB
+            )
+        ->
+            true
+        ;
+            fail
+        )
+    ;
+        error("is_worth_parallelizing: GoalA and/or GoalB are/is" ++
+            " not a call")
+    ).
+    % Give the number of argument variables of a call (plain call, higher
+    % order call, class method call or foreign procedure call).
+    % Fail if the goal is none of these.
+    %
+:- pred get_number_args(hlds_goal::in, int::out) is semidet.
+get_number_args(CallExpr - _, NbArgs) :-
+    (
+        CallExpr = plain_call(_, _, PlainArgs, _, _, _),
+        NbArgs = list.length(PlainArgs)
+    ;
+        CallExpr = generic_call(Details, GenericArgs, _, _),
+        (
+            Details = higher_order(_, _, _, _)
+        ;
+            Details = class_method(_, _, _, _)
+        ),
+        NbArgs = list.length(GenericArgs)
+    ;
+        CallExpr = call_foreign_proc(_, _, _, ForeignArgs, _, _, _),
+        NbArgs = list.length(ForeignArgs)
+    ).
+    % Put Call in front of the conjuncts of an existing parallel conjunction.
+    % The nonlocals, instmap delta, determinism and purity of the resulting
+    % goal are recomputed from the new list of conjuncts; the rest of the
+    % goal info of the original parallel conjunction is kept.
+    % Abort if ParallelGoal0 is not a parallel conjunction.
+    %
+:- pred add_call_to_parallel_conjunction(hlds_goal::in, hlds_goal::in,
+    hlds_goal::out) is det.
+add_call_to_parallel_conjunction(Call, ParallelGoal0, ParallelGoal) :-
+    ParallelGoal0 = ParallelGoalExpr0 - GoalInfo0,
+    ( ParallelGoalExpr0 = conj(parallel_conj, Conjuncts0) ->
+        Conjuncts = [Call | Conjuncts0],
+        some [!GoalInfo] (
+            !:GoalInfo = GoalInfo0,
+            goal_list_nonlocals(Conjuncts, NonLocals),
+            goal_info_set_nonlocals(NonLocals, !GoalInfo),
+            goal_list_instmap_delta(Conjuncts, InstMapDelta),
+            goal_info_set_instmap_delta(InstMapDelta, !GoalInfo),
+            goal_list_determinism(Conjuncts, Detism),
+            goal_info_set_determinism(Detism, !GoalInfo),
+            goal_list_purity(Conjuncts, Purity),
+            goal_info_set_purity(Purity, !GoalInfo),
+            ParallelGoal = conj(parallel_conj, Conjuncts) - !.GoalInfo
+        )
+    ;
+        error("add_call_to_parallel_conjunction: ParallelGoal0 is not a " ++
+            "parallel conjunction")
+    ).
+    % Build the parallel conjunction GoalA & GoalB. Its nonlocals, instmap
+    % delta, determinism and purity are computed from the two goals and its
+    % context is the one of GoalA.
+    %
+:- pred create_parallel_conj(hlds_goal::in, hlds_goal::in, hlds_goal::out)
+    is det.
+create_parallel_conj(GoalA, GoalB, ParallelGoal) :-
+    GoalA = _ - GoalAInfo,
+    goal_info_get_context(GoalAInfo, Context),
+    Conjuncts = [GoalA, GoalB],
+    goal_list_nonlocals(Conjuncts, NonLocals),
+    goal_list_instmap_delta(Conjuncts, InstMapDelta),
+    goal_list_determinism(Conjuncts, Detism),
+    goal_list_purity(Conjuncts, Purity),
+    goal_info_init(NonLocals, InstMapDelta, Detism, Purity, Context,
+        ParallelGoalInfo),
+    ParallelGoal = conj(parallel_conj, Conjuncts) - ParallelGoalInfo.
+    % Succeed if the second goal depends on the first one, i.e. if at least
+    % one of the variables changed by the first goal (according to its
+    % instmap delta) is a nonlocal of the second goal.
+    % Fail otherwise.
+    %
+:- pred goal_depends_on_goal(hlds_goal::in, hlds_goal::in) is semidet.
+goal_depends_on_goal(_ - ProducerInfo, _ - ConsumerInfo) :-
+    goal_info_get_nonlocals(ConsumerInfo, ConsumerNonLocals),
+    goal_info_get_instmap_delta(ProducerInfo, ProducerDelta),
+    instmap_delta_changed_vars(ProducerDelta, ProducedVars),
+    set.intersect(ProducedVars, ConsumerNonLocals, CommonVars),
+    not set.empty(CommonVars).
+    % Process a conjunction for implicit parallelism, starting at the
+    % conjunct at (1-based) position IndexInConj.
+    %
+    % Each conjunct is handed to process_goal_for_implicit_parallelism
+    % together with the whole conjunction; the (possibly rewritten)
+    % conjunction and the index of the next conjunct to look at come back
+    % from that call. We stop once the index is past the last conjunct.
+    % Abort if the goal expression is not a conjunction.
+    %
+:- pred process_conj_for_implicit_parallelism(
+    hlds_goal_expr::in, hlds_goal_expr::out, int::in,
+    proc_info::in, module_info::in, module_info::out,
+    list(css_to_be_parallelized)::in, list(css_to_be_parallelized)::out,
+    counter::in, counter::out) is det.
+process_conj_for_implicit_parallelism(!HLDSGoalExpr, IndexInConj, ProcInfo,
+        !ModuleInfo, !CalleeListToBeParallelized, !Counter) :-
+    ( !.HLDSGoalExpr = conj(_, Conjuncts) ->
+        ( IndexInConj =< list.length(Conjuncts) ->
+            list.index1_det(Conjuncts, IndexInConj, Conjunct),
+            % The processed conjunct itself is ignored: any change is
+            % reflected in the conjunction returned through MaybeConj.
+            process_goal_for_implicit_parallelism(Conjunct, _, ProcInfo,
+                !ModuleInfo, yes(!.HLDSGoalExpr), MaybeConj,
+                IndexInConj, NextIndexInConj,
+                !CalleeListToBeParallelized, !Counter),
+            (
+                MaybeConj = yes(ConjProcessed),
+                !:HLDSGoalExpr = ConjProcessed
+            ;
+                MaybeConj = no,
+                error("process_conj_for_implicit_parallelism: wrong maybe" ++
+                    " value")
+            ),
+            process_conj_for_implicit_parallelism(!HLDSGoalExpr,
+                NextIndexInConj, ProcInfo, !ModuleInfo,
+                !CalleeListToBeParallelized, !Counter)
+        ;
+            true
+        )
+    ;
+        error("process_conj_for_implicit_parallelism: not a conjunct")
+    ).
+    % process_disj_for_implicit_parallelism(Disjuncts, RevAcc0, RevAcc,
+    %   ProcInfo, !ModuleInfo, !CalleeListToBeParallelized, !Counter):
+    %
+    % Process each disjunct in turn with process_goal_for_implicit_parallelism.
+    % Every disjunct is processed on its own, i.e. with no enclosing
+    % conjunction (`no') and a conjunction index of 0. Each processed disjunct
+    % is pushed onto the front of the accumulator, so RevAcc holds the
+    % processed disjuncts in the reverse of their original order, followed by
+    % the contents of RevAcc0.
+    %
+:- pred process_disj_for_implicit_parallelism(
+    list(hlds_goal)::in, list(hlds_goal)::in, list(hlds_goal)::out,
+    proc_info::in, module_info::in, module_info::out,
+    list(css_to_be_parallelized)::in, list(css_to_be_parallelized)::out,
+    counter::in, counter::out) is det.
+process_disj_for_implicit_parallelism([], RevAcc, RevAcc, _ProcInfo,
+        !ModuleInfo, !CalleeListToBeParallelized, !Counter).
+process_disj_for_implicit_parallelism([Disjunct0 | Disjuncts0], RevAcc0,
+        RevAcc, ProcInfo, !ModuleInfo, !CalleeListToBeParallelized,
+        !Counter) :-
+    process_goal_for_implicit_parallelism(Disjunct0, Disjunct,
+        ProcInfo, !ModuleInfo, no, _, 0, _, !CalleeListToBeParallelized,
+        !Counter),
+    process_disj_for_implicit_parallelism(Disjuncts0, [Disjunct | RevAcc0],
+        RevAcc, ProcInfo, !ModuleInfo, !CalleeListToBeParallelized,
+        !Counter).
+    % update_conj_and_index(MaybeConj0, MaybeConj, Goal, Index0, Index):
+    %
+    % If MaybeConj0 holds a conjunction, replace its conjunct at the 1-based
+    % position Index0 by Goal (aborting if there is no such position) and
+    % return the position of the following conjunct in Index. In every other
+    % case both the maybe value and the index are returned unchanged.
+    %
+:- pred update_conj_and_index(
+    maybe(hlds_goal_expr)::in, maybe(hlds_goal_expr)::out,
+    hlds_goal::in, int::in, int::out) is det.
+update_conj_and_index(MaybeConj0, MaybeConj, Goal, Index0, Index) :-
+    ( MaybeConj0 = yes(conj(ConjType, Conjuncts0)) ->
+        list.replace_nth_det(Conjuncts0, Index0, Goal, Conjuncts),
+        MaybeConj = yes(conj(ConjType, Conjuncts)),
+        Index = Index0 + 1
+    ;
+        MaybeConj = MaybeConj0,
+        Index = Index0
+    ).
+    % process_switch_cases_for_implicit_parallelism(Cases, RevAcc0, RevAcc,
+    %   ProcInfo, !ModuleInfo, !CalleeListToBeParallelized, !Counter):
+    %
+    % Process the goal of each switch arm with
+    % process_goal_for_implicit_parallelism, keeping the arm's functor. Each
+    % arm's goal is processed on its own, i.e. with no enclosing conjunction
+    % (`no') and a conjunction index of 0. The rebuilt cases are pushed onto
+    % the front of the accumulator, so RevAcc holds them in the reverse of
+    % their original order, followed by the contents of RevAcc0.
+    %
+:- pred process_switch_cases_for_implicit_parallelism(
+    list(case)::in, list(case)::in, list(case)::out, proc_info::in,
+    module_info::in, module_info::out,
+    list(css_to_be_parallelized)::in, list(css_to_be_parallelized)::out,
+    counter::in, counter::out) is det.
+process_switch_cases_for_implicit_parallelism([], RevAcc, RevAcc, _ProcInfo,
+        !ModuleInfo, !CalleeListToBeParallelized, !Counter).
+process_switch_cases_for_implicit_parallelism([Case0 | Cases0], RevAcc0,
+        RevAcc, ProcInfo, !ModuleInfo, !CalleeListToBeParallelized,
+        !Counter) :-
+    Case0 = case(Functor, ArmGoal0),
+    process_goal_for_implicit_parallelism(ArmGoal0, ArmGoal, ProcInfo,
+        !ModuleInfo, no, _, 0, _, !CalleeListToBeParallelized, !Counter),
+    Case = case(Functor, ArmGoal),
+    process_switch_cases_for_implicit_parallelism(Cases0, [Case | RevAcc0],
+        RevAcc, ProcInfo, !ModuleInfo, !CalleeListToBeParallelized,
+        !Counter).
+    % parse_feedback_file(InputFile, MaybeCSSListToBeParallelized, !IO):
+    %
+    % Read the whole of InputFile, split it into lines (using
+    % is_carriage_return as the separator test), check the header with
+    % process_header and parse the remaining lines with process_body.
+    %
+    % The result is ok(List) on success. It is error(Message) if the file
+    % cannot be opened or read, or if the header or the body is rejected.
+    % Once opened, the input stream is closed whether or not parsing
+    % succeeded.
+    %
+:- pred parse_feedback_file(string::in,
+    maybe_error(list(css_to_be_parallelized))::out, io::di, io::uo) is det.
+parse_feedback_file(InputFile, MaybeCSSListToBeParallelized, !IO) :-
+    io.open_input(InputFile, OpenResult, !IO),
+    (
+        OpenResult = ok(Stream),
+        io.read_file_as_string(Stream, ReadResult, !IO),
+        (
+            ReadResult = ok(Contents),
+            Lines = string.words_separator(is_carriage_return, Contents),
+            process_header(Lines, MaybeBodyLines, !IO),
+            (
+                MaybeBodyLines = ok(BodyLines),
+                % process_body already produces the maybe_error value that
+                % we have to return.
+                process_body(BodyLines, MaybeCSSListToBeParallelized)
+            ;
+                MaybeBodyLines = error(HeaderError),
+                MaybeCSSListToBeParallelized = error(HeaderError)
+            )
+        ;
+            ReadResult = error(_, ReadError),
+            MaybeCSSListToBeParallelized =
+                error(io.error_message(ReadError))
+        ),
+        io.close_input(Stream, !IO)
+    ;
+        OpenResult = io.error(OpenError),
+        MaybeCSSListToBeParallelized = error(io.error_message(OpenError))
+    ).
+    % Succeed iff Char terminates a line, i.e. it is a newline ('\n') or a
+    % carriage return ('\r').
+    %
+    % Accepting '\r' as well as '\n' means that a feedback file with CRLF
+    % line endings is split into clean lines, so that the exact string
+    % comparisons made on the header lines are not defeated by a trailing
+    % '\r'.
+    %
+:- pred is_carriage_return(char::in) is semidet.
+is_carriage_return(Char) :-
+    ( Char = '\n'
+    ; Char = '\r'
+    ).
+    % process_header(Lines, MaybeBodyLines, !IO):
+    %
+    % Check the header of the feedback file, given the file as a list of
+    % lines. The first line must be "Profiling feedback file" and the second
+    % "Version = 1.0". The header occupies the first four lines (the feedback
+    % file generator writes the file type, the version, the measure and the
+    % threshold); only the first two are checked here.
+    %
+    % On success MaybeBodyLines is ok(Body), where Body is Lines without its
+    % first four lines. Otherwise it is error(Message). A file which has a
+    % valid type and version but fewer than four lines is reported as an
+    % error as well, instead of aborting the compiler.
+    %
+    % The I/O state is threaded through but not used.
+    %
+:- pred process_header(list(string)::in, maybe_error(list(string))::out,
+    io::di, io::uo) is det.
+process_header(FileAsListString, MaybeFileAsListStringWithoutHeader, !IO) :-
+    ( FileAsListString = ["Profiling feedback file", Version | _] ->
+        ( Version = "Version = 1.0" ->
+            % list.split_list fails if there are fewer than four lines,
+            % whereas list.det_split_list would throw an exception.
+            (
+                list.split_list(4, FileAsListString, _,
+                    FileAsListStringWithoutHeader)
+            ->
+                MaybeFileAsListStringWithoutHeader =
+                    ok(FileAsListStringWithoutHeader)
+            ;
+                MaybeFileAsListStringWithoutHeader = error("Profiling" ++
+                    " feedback file header is incomplete")
+            )
+        ;
+            MaybeFileAsListStringWithoutHeader = error("Profiling" ++
+                " feedback file version incorrect")
+        )
+    ;
+        % The file is empty, has a single line, or does not start with the
+        % expected file type line.
+        MaybeFileAsListStringWithoutHeader = error("Not a profiling" ++
+            " feedback file")
+    ).
+    % process_body(BodyLines, MaybeCSSListToBeParallelized):
+    %
+    % Parse the lines which follow the header of the feedback file. The result
+    % is ok(List) if process_body2 succeeds on them (starting from an empty
+    % accumulator), and an error message saying that the file has been
+    % tampered with if it fails.
+    %
+:- pred process_body(list(string)::in,
+    maybe_error(list(css_to_be_parallelized))::out) is det.
+process_body(BodyLines, MaybeCSSListToBeParallelized) :-
+    ( process_body2(BodyLines, [], CSSList) ->
+        MaybeCSSListToBeParallelized = ok(CSSList)
+    ;
+        TamperedMsg = "Profiling feedback file has been" ++
+            " tampered with",
+        MaybeCSSListToBeParallelized = error(TamperedMsg)
+    ).
+    % process_body2(Lines, !CSSListToBeParallelizedAcc):
+    %
+    % Parse the body lines of the feedback file one by one. Each line is split
+    % into whitespace-separated words:
+    %
+    %   Caller SlotNumber Kind [Callee]
+    %
+    % SlotNumber must be an integer. Callee is read only when Kind is
+    % "normal_call"; for any other kind the callee is recorded as "".
+    % Lines whose first word is "Mercury" are skipped.
+    % NOTE(review): presumably such lines refer to call sites which are not
+    % in user code; confirm against the feedback file generator.
+    %
+    % Each parsed entry is pushed onto the front of the accumulator, so the
+    % entries end up in the reverse of their order in the file.
+    %
+    % Fails if a line is malformed: it has no words, it has too few words, or
+    % its slot number is not an integer. Such lines used to make the compiler
+    % throw an exception; failing instead lets the caller report the problem
+    % as an ordinary error.
+    %
+:- pred process_body2(list(string)::in, list(css_to_be_parallelized)::in,
+    list(css_to_be_parallelized)::out) is semidet.
+process_body2([], !CSSListToBeParallelizedAcc).
+process_body2([Line | Lines], !CSSListToBeParallelizedAcc) :-
+    Words = string.words_separator(is_whitespace, Line),
+    Words = [Caller | OtherWords],
+    ( Caller = "Mercury" ->
+        true
+    ;
+        OtherWords = [SlotNumber, Kind | WordsAfterKind],
+        string.to_int(SlotNumber, IntSlotNumber),
+        ( Kind = "normal_call" ->
+            WordsAfterKind = [Callee | _]
+        ;
+            Callee = ""
+        ),
+        CSStoBeParallelized = css_to_be_parallelized(Caller, IntSlotNumber,
+            Kind, Callee),
+        !:CSSListToBeParallelizedAcc = [ CSStoBeParallelized |
+            !.CSSListToBeParallelizedAcc ]
+    ),
+    process_body2(Lines, !CSSListToBeParallelizedAcc).
+:- end_module transform_hlds.implicit_parallelism.
  :- import_module transform_hlds.trailing_analysis.
  :- import_module transform_hlds.tabling_analysis.
  :- import_module transform_hlds.higher_order.
+:- import_module transform_hlds.implicit_parallelism.
  :- import_module transform_hlds.accumulator.
  :- import_module transform_hlds.tupling.
  :- import_module transform_hlds.untupling.
@@ -2414,7 +2415,7 @@
      module_info_get_globals(!.HLDS, Globals),
      globals.lookup_bool_option(Globals, verbose, Verbose),
      globals.lookup_bool_option(Globals, statistics, Stats),
      maybe_read_experimental_complexity_file(!HLDS, !IO),

      tabling(Verbose, Stats, !HLDS, !IO),
@@ -2523,6 +2524,9 @@
      maybe_structure_reuse_analysis(Verbose, Stats, !HLDS, !IO),
      maybe_dump_hlds(!.HLDS, 195, "structure_reuse", !DumpInfo, !IO),

+    maybe_implicit_parallelism(Verbose, Stats, !HLDS, !IO),
+    maybe_dump_hlds(!.HLDS, 199, "implicit_parallelism", !DumpInfo, !IO),
      maybe_control_granularity(Verbose, Stats, !HLDS, !IO),
      maybe_dump_hlds(!.HLDS, 200, "granularity", !DumpInfo, !IO),

@@ -3828,6 +3832,30 @@
          maybe_report_stats(Stats, !IO)
          Sharing = no
+    ).
+:- pred maybe_implicit_parallelism(bool::in, bool::in,
+    module_info::in, module_info::out, io::di, io::uo) is det.
+maybe_implicit_parallelism(Verbose, Stats, !HLDS, !IO) :-
+    module_info_get_globals(!.HLDS, Globals),
+    globals.lookup_bool_option(Globals, implicit_parallelism,
+        ImplicitParallelism),
+    globals.lookup_string_option(Globals, feedback_file,
+        FeedbackFile),
+    ( FeedbackFile = "" ->
+        true
+    ;
+        ( ImplicitParallelism = yes ->
+            maybe_write_string(Verbose, "% Applying implicit parallelism...\n",
+                !IO),
+            maybe_flush_output(Verbose, !IO),
+            apply_implicit_parallelism_transformation(!HLDS, FeedbackFile, !IO),
+            maybe_write_string(Verbose, "% done.\n", !IO),
+            maybe_report_stats(Stats, !IO)
+        ;
+            true
+        )

  :- pred maybe_control_granularity(bool::in, bool::in,
@@ -548,6 +548,7 @@
      ;       tuple_min_args
      ;       control_granularity
      ;       parallelism_target
+    ;       implicit_parallelism

      % Stuff for the CTGC system (structure sharing / structure reuse).
      ;       structure_sharing_analysis
@@ -789,7 +790,7 @@
              % recent when no other test can easily be constructed in
              % configure.in.

-    ;       experiment.
+    ;       experiment
              % This option is provided for use by implementors who want to
              % compare a new way of doing something with the old way. The idea
              % is that the code that switches between the two ways should
@@ -802,6 +803,7 @@
              % option to control their code, but adding an option requires
              % recompiling most of the modules in the compiler. Having this
              % option permanently here should reduce the need for that.
+    ;       feedback_file.

@@ -1296,6 +1298,7 @@
      tuple_min_args                      -   int(4),
      control_granularity                 -   bool(no),
      parallelism_target                  -   int(4),
+    implicit_parallelism                -   bool(no),

      % HLDS -> LLDS
      smart_indexing                      -   bool(no),
@@ -1533,7 +1536,8 @@
      fullarch                            -   string(""),
      local_module_id                     -   accumulating([]),
      compiler_sufficiently_recent        -   bool(no),
-    experiment                          -   string("")
+    experiment                          -   string(""),
+    feedback_file                       -   string("")

      % please keep this in alphabetic order
@@ -2047,6 +2051,7 @@
  long_option("tuple-min-args",       tuple_min_args).
  long_option("control-granularity",  control_granularity).
  long_option("parallelism-target",   parallelism_target).
+long_option("implicit-parallelism",  implicit_parallelism).

  % CTGC related options.
  long_option("structure-sharing",    structure_sharing_analysis).
@@ -2307,6 +2312,7 @@
  long_option("no-noncompact-ho-call-2004-01-15", compiler_sufficiently_recent).
  long_option("trace-io-builtins-2006-08-14", compiler_sufficiently_recent).
  long_option("experiment",           experiment).
+long_option("feedback-file",        feedback_file).


@@ -4215,7 +4221,10 @@
          "\thandle, which is specified using --parallelism-target.",
          "--parallelism-target N",
          "\tSpecified the number of CPUs of the target machine, for use by",
-        "\tthe --control-granularity option."
+        "\tthe --control-granularity option.",
+        "--implicit-parallelism",
+        "\tApply implicit parallelism if a profiling feedback file is",
+        "\tspecified using the feedback-file option."

  :- pred options_help_hlds_llds_optimization(io::di, io::uo) is det.
@@ -4711,12 +4720,15 @@
          "\tcompile several modules without the overhead of process",
          "\tcreation for each one.)",
-        "\tDisplay the compiler version."
+        "\tDisplay the compiler version.",

          % The `--fullarch' option is reserved for
          % use by the `Mercury.config' file.

          % The `--local-module-id' option is used by `mmc --make'.
+       "--feedback-file",
+        "\tUse the specified feedback file which may currently only",
+        "\tprocessed for implicit parallelism."

  :- pred write_tabbed_lines(list(string)::in, io::di, io::uo) is det.
@@ -85,6 +85,7 @@
  :- include_module untupling.
  :- include_module granularity.
  :- include_module dep_par_conj.
+:- include_module implicit_parallelism.

  :- include_module mmc_analysis.

@@ -742,6 +742,7 @@

  should_dump(DumpOptions, What) :-
      ( list.member(What, DumpOptions)
+    ; list.member("all", DumpOptions)
      ; DumpOptions = []

@@ -64,17 +64,22 @@
              write_help_message(ProgName, !IO)
              ( Args = [Input, Output] ->
-                lookup_string_option(Options, distribution, Distribution),
-                ( construct_distribution(Distribution, DistributionType) ->
+                lookup_string_option(Options, measure, Measure),
+                ( construct_measure(Measure, MeasureType) ->
                      lookup_int_option(Options, threshold, Threshold),
                      lookup_bool_option(Options, verbose, Verbose),
-                    read_deep_file(Input, Verbose, MaybeProfile, !IO),
+                    lookup_accumulating_option(Options, dump_stages,
+                        DumpStages),
+                    lookup_accumulating_option(Options, dump_options,
+                        DumpOptions),
+                    read_deep_file(Input, Verbose, DumpStages, DumpOptions,
+                        MaybeProfile, !IO),
                          MaybeProfile = ok(Deep),
                          compute_css_list_above_threshold(0, Deep, Threshold,
-                            DistributionType, [], CSSListAboveThreshold),
+                            MeasureType, [], CSSListAboveThreshold),
                          generate_feedback_file(CSSListAboveThreshold, Deep,
-                            DistributionType, Threshold, Output, !IO)
+                            MeasureType, Threshold, Output, !IO)
                          MaybeProfile = error(Error),
                          io.stderr_stream(Stderr, !IO),
@@ -111,7 +116,7 @@
      io.format("--verbose   Generate progress messages.\n", []),
      io.format("--threshold <value>\n", []),
      io.format("            Set the threshold to <value>.\n",[]),
-    io.format("--distrib average|median\n",[]),
+    io.format("--measure average|median\n",[]),
      io.format("            average : Write to <output> the call sites\n",[]),
      io.format("            static whose call sites dynamic's average\n",[]),
      io.format("            call sequence counts exceed the given\n",[]),
@@ -135,10 +140,11 @@

      % Read a deep profiling data file.
-:- pred read_deep_file(string::in, bool::in, maybe_error(deep)::out,
-    io::di, io::uo) is det.
+:- pred read_deep_file(string::in, bool::in,
+    list(string)::in, list(string)::in,
+    maybe_error(deep)::out, io::di, io::uo) is det.

-read_deep_file(Input, Verbose, MaybeProfile, !IO) :-
+read_deep_file(Input, Verbose, DumpStages, DumpOptions, MaybeProfile, !IO) :-
      server_name(Machine, !IO),
          Verbose = yes,
@@ -148,17 +154,17 @@
          Verbose = no,
          MaybeOutput = no
-    read_and_startup(Machine, [Input], no, MaybeOutput, [], [], MaybeProfile,
-        !IO).
+    read_and_startup(Machine, [Input], no, MaybeOutput,
+        DumpStages, DumpOptions, MaybeProfile, !IO).

      % Determine those CSSs whose CSDs' average/median call sequence counts
      % exceed the given threshold.
  :- pred compute_css_list_above_threshold(int::in, deep::in, int::in,
-    distribution_type::in, list(call_site_static)::in,
+    measure_type::in, list(call_site_static)::in,
      list(call_site_static)::out) is det.

-compute_css_list_above_threshold(Index, Deep, Threshold, Distribution,
+compute_css_list_above_threshold(Index, Deep, Threshold, Measure,
          !CSSAcc) :-
      array.size(Deep ^ call_site_statics, Size),
      ( Index = Size ->
@@ -173,13 +179,13 @@
              Callseqs = 0
-                Distribution = average,
+                Measure = average,
                  list.foldr(sum_callseqs_csd_ptr(Deep), CSDList,
                      0, SumCallseqs),
                  % NOTE: we have checked that NumCSD is not zero above.
                  Callseqs = SumCallseqs // NumCSD
-                Distribution = median,
+                Measure = median,
                  list.sort(compare_csd_ptr(Deep), CSDList, CSDListSorted),
                  IndexMedian = NumCSD // 2,
                  list.index0_det(CSDListSorted, IndexMedian, MedianPtr),
@@ -188,12 +194,12 @@
          ( Callseqs >= Threshold ->
              CSS = array.lookup(Deep ^ call_site_statics, Index),
-            !:CSSAcc = [ CSS | !.CSSAcc ],
+            list.append(!.CSSAcc, [CSS], !:CSSAcc),
              compute_css_list_above_threshold(Index + 1, Deep, Threshold,
-                Distribution, !CSSAcc)
+                Measure, !CSSAcc)
              compute_css_list_above_threshold(Index + 1, Deep, Threshold,
-                Distribution, !CSSAcc)
+                Measure, !CSSAcc)

@@ -223,9 +229,9 @@
      % threshold.
  :- pred generate_feedback_file(list(call_site_static)::in, deep::in,
-    distribution_type::in, int::in, string::in, io::di, io::uo) is det.
+    measure_type::in, int::in, string::in, io::di, io::uo) is det.

-generate_feedback_file(CSSList, Deep, Distribution, Threshold, Output, !IO) :-
+generate_feedback_file(CSSList, Deep, Measure, Threshold, Output, !IO) :-
      io.open_output(Output, Result, !IO),
          Result = io.error(Err),
@@ -236,11 +242,11 @@
          io.write_string(Stream, "Profiling feedback file\n", !IO),
          io.write_string(Stream, "Version = 1.0\n", !IO),
-            Distribution = average,
-            io.write_string(Stream, "Distribution = average\n", !IO)
+            Measure = average,
+            io.write_string(Stream, "Measure = average\n", !IO)
-            Distribution = median,
-            io.write_string(Stream, "Distribution = median\n", !IO)
+            Measure = median,
+            io.write_string(Stream, "Measure = median\n", !IO)
          io.format(Stream, "Threshold = %i\n", [i(Threshold)], !IO),
          write_css_list(CSSList, Deep, Stream, !IO),
@@ -290,9 +296,11 @@
      ;       help
      ;       verbose
      ;       version
-    ;       distribution.
+    ;       measure
+    ;       dump_stages
+    ;       dump_options.

-:- type distribution_type
+:- type measure_type
      --->    average
      ;       median.

@@ -304,7 +312,9 @@
  short('t',  threshold).
  short('h',  help).
  short('v',  version).
-short('d',  distribution).
+short('m',  measure).
+short('d',  dump_stages).
+short('D',  dump_options).

  :- pred long(string::in, option::out) is semidet.
@@ -313,8 +323,9 @@
  long("help",                help).
  long("verbose",             verbose).
  long("version",             version).
-long("distrib",             distribution).
-long("distribution",        distribution).
+long("measure",             measure).
+long("dump-stages",         dump_stages).
+long("dump-options",        dump_options).

  :- pred defaults(option::out, option_data::out) is multi.

@@ -322,12 +333,14 @@
  defaults(help,              bool(no)).
  defaults(verbose,           bool(no)).
  defaults(version,           bool(no)).
-defaults(distribution,      string("average")).
+defaults(measure,           string("average")).
+defaults(dump_stages,       accumulating([])).
+defaults(dump_options,      accumulating([])).

-:- pred construct_distribution(string::in, distribution_type::out) is semidet.
+:- pred construct_measure(string::in, measure_type::out) is semidet.

-construct_distribution("average",    average).
-construct_distribution("median",     median).
+construct_measure("average",    average).
+construct_measure("median",     median).

  :- end_module mdprof_feedback.
