[scheduler][restart] Reduce live-locking by using data produced on failure

Summary: When a worker fails because it can't a get the lock of a `Procname` it will include it in the exception that it throws so the `RestartScheduler` can record it as a dependency. Then when scheduling a new work item from `RestartScheduler.next` it will check if this dependency is already met, if it isn't it will not schedule the `Procname` yet.

Reviewed By: ngorogiannis

Differential Revision: D19820331

fbshipit-source-id: b48cacc9a
master
Fernando Gasperi Jabalera 5 years ago committed by Facebook Github Bot
parent b2a2919ce2
commit 5c5609591e

@ -163,7 +163,7 @@ module AbstractInterpreterCommon (TransferFunctions : TransferFunctions.SIL) = s
Ok post
with exn ->
IExn.reraise_if exn ~f:(fun () ->
match exn with ProcessPool.ProcnameAlreadyLocked -> true | _ -> false ) ;
match exn with RestartScheduler.ProcnameAlreadyLocked _ -> true | _ -> false ) ;
(* delay reraising to get a chance to write the debug HTML *)
let backtrace = Caml.Printexc.get_raw_backtrace () in
Error (exn, backtrace, instr)

@ -6,4 +6,4 @@
*)
open! IStd
val make : SourceFile.t list -> SchedulerTypes.target ProcessPool.TaskGenerator.t
val make : SourceFile.t list -> (SchedulerTypes.target, Procname.t) ProcessPool.TaskGenerator.t

@ -20,15 +20,18 @@ let clear_caches () =
BufferOverrunUtils.clear_cache ()
let analyze_target : SchedulerTypes.target Tasks.doer =
let analyze_target : (SchedulerTypes.target, Procname.t) Tasks.doer =
let analyze_source_file exe_env source_file =
if Topl.is_active () then DB.Results_dir.init (Topl.sourcefile ()) ;
DB.Results_dir.init source_file ;
L.task_progress SourceFile.pp source_file ~f:(fun () ->
Ondemand.analyze_file exe_env source_file ;
if Topl.is_active () && Config.debug_mode then
DotCfg.emit_frontend_cfg (Topl.sourcefile ()) (Topl.cfg ()) ;
if Config.write_html then Printer.write_all_html_files source_file )
try
Ondemand.analyze_file exe_env source_file ;
if Topl.is_active () && Config.debug_mode then
DotCfg.emit_frontend_cfg (Topl.sourcefile ()) (Topl.cfg ()) ;
if Config.write_html then Printer.write_all_html_files source_file ;
None
with RestartScheduler.ProcnameAlreadyLocked pname -> Some pname )
in
(* In call-graph scheduling, log progress every [per_procedure_logging_granularity] procedures.
The default roughly reflects the average number of procedures in a C++ file. *)
@ -41,7 +44,10 @@ let analyze_target : SchedulerTypes.target Tasks.doer =
L.log_task "Analysing block of %d procs, starting with %a@." per_procedure_logging_granularity
Procname.pp proc_name ;
procs_left := per_procedure_logging_granularity ) ;
Ondemand.analyze_proc_name_toplevel exe_env proc_name
try
Ondemand.analyze_proc_name_toplevel exe_env proc_name ;
None
with RestartScheduler.ProcnameAlreadyLocked pname -> Some pname
in
fun target ->
let exe_env = Exe_env.mk () in

@ -6,6 +6,7 @@
*)
open! IStd
module L = Logging
exception UnlockNotLocked of Procname.t
@ -45,3 +46,10 @@ let try_lock pname =
Unix.symlink ~target:locks_target ~link_name:(filename_from pname) ;
true
with Unix.Unix_error (Unix.EEXIST, _, _) -> false )
let is_locked pname =
try
ignore (Unix.lstat (filename_from pname)) ;
true
with Unix.Unix_error (Unix.ENOENT, _, _) -> false

@ -22,3 +22,5 @@ val unlock : Procname.t -> unit
val clean : unit -> unit
(** This should be called when locks will no longer be used to remove any files or state that's not
necessary. *)
val is_locked : Procname.t -> bool

@ -7,13 +7,30 @@
open! IStd
module L = Logging
let of_list (lst : 'a list) : 'a ProcessPool.TaskGenerator.t =
exception ProcnameAlreadyLocked of Procname.t
type work_with_dependency = {work: SchedulerTypes.target; need_to_finish: Procname.t option}
let of_list (lst : work_with_dependency list) : ('a, Procname.t) ProcessPool.TaskGenerator.t =
let content = Queue.of_list lst in
let remaining = ref (Queue.length content) in
let remaining_tasks () = !remaining in
let is_empty () = Int.equal !remaining 0 in
let finished ~completed work = if completed then decr remaining else Queue.enqueue content work in
let next () = Queue.dequeue content in
let finished ~result work =
match result with
| None ->
decr remaining
| Some _ as need_to_finish ->
Queue.enqueue content {work; need_to_finish}
in
let work_if_dependency_allows w =
match w.need_to_finish with
| Some pname when ProcLocker.is_locked pname ->
Queue.enqueue content w ; None
| None | Some _ ->
Some w.work
in
let next () = Option.bind (Queue.dequeue content) ~f:(fun w -> work_if_dependency_allows w) in
{remaining_tasks; is_empty; finished; next}
@ -21,9 +38,12 @@ let make sources =
let pnames =
List.map sources ~f:SourceFiles.proc_names_of_source
|> List.concat
|> List.rev_map ~f:(fun procname -> SchedulerTypes.Procname procname)
|> List.rev_map ~f:(fun procname ->
{work= SchedulerTypes.Procname procname; need_to_finish= None} )
in
let files =
List.map sources ~f:(fun file -> {work= SchedulerTypes.File file; need_to_finish= None})
in
let files = List.map sources ~f:(fun file -> SchedulerTypes.File file) in
let permute = List.permute ~random_state:(Random.State.make (Array.create ~len:1 0)) in
permute pnames @ permute files |> of_list
@ -44,7 +64,7 @@ let lock_exn pname =
if ProcLocker.try_lock pname then record_locked_proc pname
else (
unlock_all () ;
raise ProcessPool.ProcnameAlreadyLocked ) )
raise (ProcnameAlreadyLocked pname) ) )
let unlock pname =

@ -6,6 +6,8 @@
*)
open! IStd
exception ProcnameAlreadyLocked of Procname.t
val setup : unit -> unit
val clean : unit -> unit
@ -14,4 +16,4 @@ val lock_exn : Procname.t -> unit
val unlock : Procname.t -> unit
val make : SourceFile.t list -> SchedulerTypes.target ProcessPool.TaskGenerator.t
val make : SourceFile.t list -> (SchedulerTypes.target, Procname.t) ProcessPool.TaskGenerator.t

@ -46,7 +46,7 @@ let build_from_sources sources =
g
let bottom_up sources : SchedulerTypes.target ProcessPool.TaskGenerator.t =
let bottom_up sources : (SchedulerTypes.target, Procname.t) ProcessPool.TaskGenerator.t =
let open SchedulerTypes in
let syntactic_call_graph = build_from_sources sources in
let remaining = ref (CallGraph.n_procs syntactic_call_graph) in
@ -83,7 +83,7 @@ let bottom_up sources : SchedulerTypes.target ProcessPool.TaskGenerator.t =
CallGraph.flag syntactic_call_graph n.pname ;
Some (Procname n.pname)
in
let finished ~completed:_ = function
let finished ~result:_ = function
| Procname pname ->
decr remaining ;
decr scheduled ;

@ -6,7 +6,7 @@
*)
open! IStd
val make : SourceFile.t list -> SchedulerTypes.target ProcessPool.TaskGenerator.t
val make : SourceFile.t list -> (SchedulerTypes.target, Procname.t) ProcessPool.TaskGenerator.t
(** task generator that works by
- loading the syntactic call graph from the capture DB

@ -7,7 +7,7 @@
open! IStd
type 'a doer = 'a -> unit
type ('a, 'b) doer = 'a -> 'b option
let fork_protect ~f x = BackendStats.reset () ; ForkUtils.protect ~f x
@ -17,7 +17,7 @@ let with_new_db_connection ~f () =
module Runner = struct
type ('work, 'final) t = ('work, 'final) ProcessPool.t
type ('work, 'final, 'result) t = ('work, 'final, 'result) ProcessPool.t
let create ~jobs ~f ~child_epilogue ~tasks =
PerfEvent.(
@ -41,7 +41,7 @@ module Runner = struct
ProcessPool.run runner
end
let run_sequentially ~(f : 'a doer) (tasks : 'a list) : unit =
let run_sequentially ~(f : ('a, 'b) doer) (tasks : 'a list) : unit =
let task_generator = ProcessPool.TaskGenerator.of_list tasks in
let task_bar = TaskBar.create ~jobs:1 in
(ProcessPoolState.update_status :=
@ -53,8 +53,8 @@ let run_sequentially ~(f : 'a doer) (tasks : 'a list) : unit =
let rec run_tasks () =
if not (task_generator.is_empty ()) then (
Option.iter (task_generator.next ()) ~f:(fun t ->
f t ;
task_generator.finished ~completed:true t ) ;
let result = f t in
task_generator.finished ~result t ) ;
TaskBar.set_remaining_tasks task_bar (task_generator.remaining_tasks ()) ;
TaskBar.refresh task_bar ;
run_tasks () )

@ -7,9 +7,9 @@
open! IStd
type 'a doer = 'a -> unit
type ('a, 'b) doer = 'a -> 'b option
val run_sequentially : f:'a doer -> 'a list -> unit
val run_sequentially : f:('a, 'b) doer -> 'a list -> unit
(** Run the tasks sequentially *)
val fork_protect : f:('a -> 'b) -> 'a -> 'b
@ -17,16 +17,16 @@ val fork_protect : f:('a -> 'b) -> 'a -> 'b
(** A runner accepts new tasks repeatedly for parallel execution *)
module Runner : sig
type ('work, 'final) t
type ('work, 'final, 'result) t
val create :
jobs:int
-> f:'work doer
-> f:('work, 'result) doer
-> child_epilogue:(unit -> 'final)
-> tasks:(unit -> 'work ProcessPool.TaskGenerator.t)
-> ('work, 'final) t
-> tasks:(unit -> ('work, 'result) ProcessPool.TaskGenerator.t)
-> ('work, 'final, 'result) t
(** Create a runner running [jobs] jobs in parallel *)
val run : (_, 'final) t -> 'final option Array.t
val run : (_, 'final, _) t -> 'final option Array.t
(** Start the given tasks with the runner and wait until completion *)
end

@ -219,7 +219,7 @@ let run_proc_analysis ~caller_pdesc callee_pdesc =
let backtrace = Printexc.get_backtrace () in
IExn.reraise_if exn ~f:(fun () ->
match exn with
| ProcessPool.ProcnameAlreadyLocked ->
| RestartScheduler.ProcnameAlreadyLocked _ ->
clear_actives () ; true
| _ ->
if not !logged_error then (

@ -213,7 +213,11 @@ let log_task fmt =
log ~to_console progress_file_fmts fmt
let task_progress ~f pp x = log_task "%a starting@." pp x ; f () ; log_task "%a DONE@." pp x
let task_progress ~f pp x =
log_task "%a starting@." pp x ;
let result = f () in
log_task "%a DONE@." pp x ; result
let user_warning fmt = log ~to_console:(not Config.quiet) user_warning_file_fmts fmt

@ -26,7 +26,7 @@ val progress : ('a, F.formatter, unit) format -> 'a
val log_task : ('a, F.formatter, unit) format -> 'a
(** log progress in the log file and on the console unless there is an active task bar *)
val task_progress : f:(unit -> unit) -> (F.formatter -> 'a -> unit) -> 'a -> unit
val task_progress : f:(unit -> 'a) -> (F.formatter -> 'b -> unit) -> 'b -> 'a
(** [task_progress ~f pp x] executes [f] and log progress [pp x] in the log file and also on the
console unless there is an active task bar *)

@ -9,16 +9,14 @@ open! IStd
module F = Format
module L = Logging
exception ProcnameAlreadyLocked
module TaskGenerator = struct
type 'a t =
type ('a, 'b) t =
{ remaining_tasks: unit -> int
; is_empty: unit -> bool
; finished: completed:bool -> 'a -> unit
; finished: result:'b option -> 'a -> unit
; next: unit -> 'a option }
let chain (gen1 : 'a t) (gen2 : 'a t) : 'a t =
let chain (gen1 : ('a, 'b) t) (gen2 : ('a, 'b) t) : ('a, 'b) t =
let remaining_tasks () = gen1.remaining_tasks () + gen2.remaining_tasks () in
let gen1_returned_empty = ref false in
let gen1_is_empty () =
@ -26,20 +24,19 @@ module TaskGenerator = struct
!gen1_returned_empty
in
let is_empty () = gen1_is_empty () && gen2.is_empty () in
let finished ~completed work_item =
if gen1_is_empty () then gen2.finished ~completed work_item
else gen1.finished ~completed work_item
let finished ~result work_item =
if gen1_is_empty () then gen2.finished ~result work_item else gen1.finished ~result work_item
in
let next x = if gen1_is_empty () then gen2.next x else gen1.next x in
{remaining_tasks; is_empty; finished; next}
let of_list (lst : 'a list) : 'a t =
let of_list (lst : 'a list) : ('a, _) t =
let content = ref lst in
let length = ref (List.length lst) in
let remaining_tasks () = !length in
let is_empty () = List.is_empty !content in
let finished ~completed:_ _work_item = decr length in
let finished ~result:_ _work_item = decr length in
let next () =
match !content with
| [] ->
@ -65,7 +62,7 @@ type child_info = {pid: Pid.t; down_pipe: Out_channel.t}
type 'a child_state = Initializing | Idle | Processing of 'a
(** the state of the pool *)
type ('work, 'final) t =
type ('work, 'final, 'result) t =
{ jobs: int
(** number of jobs running in parallel, i.e. number of children we are responsible for *)
; slots: child_info Array.t
@ -75,7 +72,7 @@ type ('work, 'final) t =
; children_updates: Unix.File_descr.t
(** all the children send updates up the same pipe to the pool *)
; task_bar: TaskBar.t
; tasks: 'work TaskGenerator.t (** generator for work remaining to be done *)
; tasks: ('work, 'result) TaskGenerator.t (** generator for work remaining to be done *)
; file_lock: Utils.file_lock (** file lock for sending worker messages *) }
(** {2 Constants} *)
@ -99,11 +96,11 @@ let buffer_size = 65_535
the pipe will crash in the parent process. This is a limitation of the way we read from the pipe
for now. To lift it, it should be possible to extend the buffer to the required length if we
notice that we are trying to read more than [buffer_size] for example. *)
type worker_message =
type 'result worker_message =
| UpdateStatus of int * Mtime.t * string
(** [(i, t, status)]: starting a task from slot [i], at start time [t], with description
[status]. Watch out that [status] must not be too close in length to [buffer_size]. *)
| Ready of {worker: int; completed: bool}
| Ready of {worker: int; result: 'result}
(** Sent after finishing initializing or after finishing a given task. When received by
master, this moves the worker state from [Initializing] or [Processing _] to [Idle]. *)
| Crash of int (** there was an error and the child is no longer receiving messages *)
@ -265,12 +262,12 @@ let process_updates pool buffer =
(* clean crash, give the child process a chance to cleanup *)
Unix.wait (`Pid pid) |> ignore ;
killall pool ~slot "see backtrace above"
| Ready {worker= slot; completed} ->
| Ready {worker= slot; result} ->
( match pool.children_states.(slot) with
| Initializing ->
()
| Processing work ->
pool.tasks.finished ~completed work
pool.tasks.finished ~result work
| Idle ->
L.die InternalError "Received a Ready message from an idle worker@." ) ;
TaskBar.set_remaining_tasks pool.task_bar (pool.tasks.remaining_tasks ()) ;
@ -284,7 +281,7 @@ let process_updates pool buffer =
type 'a final_worker_message = Finished of int * 'a option | FinalCrash of int
let collect_results (pool : (_, 'final) t) =
let collect_results (pool : (_, 'final, _) t) =
let failed = ref false in
let updates_in = Unix.in_channel_of_descr pool.children_updates in
(* use [Array.init] just to collect n messages, the order in the array will not be the same as the
@ -332,9 +329,8 @@ let wait_all pool =
(** worker loop: wait for tasks and run [f] on them until we are told to go home *)
let rec child_loop ~slot send_to_parent send_final receive_from_parent ~f ~epilogue ~prev_completed
=
send_to_parent (Ready {worker= slot; completed= prev_completed}) ;
let rec child_loop ~slot send_to_parent send_final receive_from_parent ~f ~epilogue ~prev_result =
send_to_parent (Ready {worker= slot; result= prev_result}) ;
match receive_from_parent () with
| GoHome -> (
match epilogue () with
@ -352,23 +348,21 @@ let rec child_loop ~slot send_to_parent send_final receive_from_parent ~f ~epilo
true ) ) )
| Do stuff ->
let result =
try f stuff ; true with
| ProcnameAlreadyLocked ->
false
| e ->
IExn.reraise_if e ~f:(fun () ->
if Config.keep_going then (
L.internal_error "Error in subprocess %d: %a@." slot Exn.pp e ;
(* do not raise and continue accepting jobs *)
false )
else (
(* crash hard, but first let the master know that we have crashed *)
send_to_parent (Crash slot) ;
true ) ) ;
true
try f stuff
with e ->
IExn.reraise_if e ~f:(fun () ->
if Config.keep_going then (
L.internal_error "Error in subprocess %d: %a@." slot Exn.pp e ;
(* do not raise and continue accepting jobs *)
false )
else (
(* crash hard, but first let the master know that we have crashed *)
send_to_parent (Crash slot) ;
true ) ) ;
None
in
child_loop ~slot send_to_parent send_final receive_from_parent ~f ~epilogue
~prev_completed:result
~prev_result:result
(** Fork a new child and start it so that it is ready for work.
@ -387,7 +381,7 @@ let fork_child ~file_lock ~child_prelude ~slot (updates_r, updates_w) ~f ~epilog
ProcessPoolState.reset_pid () ;
child_prelude () ;
let updates_oc = Unix.out_channel_of_descr updates_w in
let send_to_parent (message : worker_message) =
let send_to_parent (message : 'b worker_message) =
marshal_to_pipe ~file_lock updates_oc message
in
let send_final (final_message : 'a final_worker_message) =
@ -414,8 +408,7 @@ let fork_child ~file_lock ~child_prelude ~slot (updates_r, updates_w) ~f ~epilog
PerfEvent.(log (fun logger -> log_end_event logger ())) ;
x
in
child_loop ~slot send_to_parent send_final receive_from_parent ~f ~epilogue
~prev_completed:true ;
child_loop ~slot send_to_parent send_final receive_from_parent ~f ~epilogue ~prev_result:None ;
Out_channel.close updates_oc ;
In_channel.close orders_ic ;
Epilogues.run () ;
@ -428,10 +421,10 @@ let fork_child ~file_lock ~child_prelude ~slot (updates_r, updates_w) ~f ~epilog
let create :
jobs:int
-> child_prelude:(unit -> unit)
-> f:('work -> unit)
-> f:('work -> 'result option)
-> child_epilogue:(unit -> 'final)
-> tasks:(unit -> 'work TaskGenerator.t)
-> ('work, 'final) t =
-> tasks:(unit -> ('work, 'result) TaskGenerator.t)
-> ('work, 'final, 'result) t =
fun ~jobs ~child_prelude ~f ~child_epilogue ~tasks ->
let file_lock = Utils.create_file_lock () in
let task_bar = TaskBar.create ~jobs in

@ -7,29 +7,29 @@
open! IStd
exception ProcnameAlreadyLocked
module TaskGenerator : sig
(** abstraction for generating jobs *)
type 'a t =
type ('a, 'b) t =
{ remaining_tasks: unit -> int
(** number of tasks remaining to complete -- only used for reporting, so imprecision is
not a bug *)
; is_empty: unit -> bool
(** when should the main loop of the task manager stop expecting new tasks *)
; finished: completed:bool -> 'a -> unit
(** Process pool calls [finished x] when a worker finishes item [x]. This is only called
if [next ()] has previously returned [Some x] and [x] was sent to a worker. *)
; finished: result:'b option -> 'a -> unit
(** Process pool calls [finished result:r x] when a worker finishes item [x]. result is
None when the item was completed successfully and Some pname when it failed because it
could not lock pname. This is only called if [next ()] has previously returned
[Some x] and [x] was sent to a worker. *)
; next: unit -> 'a option
(** [next ()] generates the next work item. If [is_empty ()] is true then [next ()] must
return [None]. However, it is OK to for [next ()] to return [None] when [is_empty] is
false. This corresponds to the case where there is more work to be done, but it is not
schedulable until some already scheduled work is finished. *) }
val chain : 'a t -> 'a t -> 'a t
val chain : ('a, 'b) t -> ('a, 'b) t -> ('a, 'b) t
(** chain two generators in order *)
val of_list : 'a list -> 'a t
val of_list : 'a list -> ('a, 'b) t
(** schedule tasks out of a concrete list *)
end
@ -51,17 +51,17 @@ end
(** A [('work, 'final) t] process pool accepts tasks of type ['work] and produces an array of
results of type ['final]. ['work] and ['final] will be marshalled over a Unix pipe.*)
type (_, _) t
type (_, _, _) t
val create :
jobs:int
-> child_prelude:(unit -> unit)
-> f:('work -> unit)
-> f:('work -> 'result option)
-> child_epilogue:(unit -> 'final)
-> tasks:(unit -> 'work TaskGenerator.t)
-> ('work, 'final) t
-> tasks:(unit -> ('work, 'result) TaskGenerator.t)
-> ('work, 'final, 'result) t
(** Create a new pool of processes running [jobs] jobs in parallel *)
val run : (_, 'final) t -> 'final option Array.t
val run : (_, 'final, 'result) t -> 'final option Array.t
(** use the processes in the given process pool to run all the given tasks in parallel and return
the results of the epilogues *)

@ -26,13 +26,14 @@ let create_cmd (source_file, (compilation_data : CompilationDatabase.compilation
let invoke_cmd (source_file, (cmd : CompilationDatabase.compilation_data)) =
let argv = cmd.executable :: cmd.escaped_arguments in
( match Spawn.spawn ~cwd:(Path cmd.directory) ~prog:cmd.executable ~argv () with
| pid ->
!ProcessPoolState.update_status (Mtime_clock.now ()) (SourceFile.to_string source_file) ;
Unix.waitpid (Pid.of_int pid)
|> Result.map_error ~f:(fun unix_error -> Unix.Exit_or_signal.to_string_hum (Error unix_error))
| exception Unix.Unix_error (err, f, arg) ->
Error (F.asprintf "%s(%s): %s@." f arg (Unix.Error.message err)) )
( ( match Spawn.spawn ~cwd:(Path cmd.directory) ~prog:cmd.executable ~argv () with
| pid ->
!ProcessPoolState.update_status (Mtime_clock.now ()) (SourceFile.to_string source_file) ;
Unix.waitpid (Pid.of_int pid)
|> Result.map_error ~f:(fun unix_error ->
Unix.Exit_or_signal.to_string_hum (Error unix_error) )
| exception Unix.Unix_error (err, f, arg) ->
Error (F.asprintf "%s(%s): %s@." f arg (Unix.Error.message err)) )
|> function
| Ok () ->
()
@ -42,7 +43,8 @@ let invoke_cmd (source_file, (cmd : CompilationDatabase.compilation_data)) =
else L.die ExternalError fmt
in
log_or_die "Error running compilation for '%a': %a:@\n%s@." SourceFile.pp source_file
Pp.cli_args argv error
Pp.cli_args argv error ) ;
None
let run_compilation_database compilation_database should_capture_file =

Loading…
Cancel
Save