From 013d15353820e3769840a1cc64843d65843da32b Mon Sep 17 00:00:00 2001 From: Nikos Gorogiannis Date: Mon, 17 Jun 2019 13:07:30 -0700 Subject: [PATCH] [buck/java2] hashcons the global tenv during merging Summary: One "interesting" feature of the approach of merging the captured targets in Java, is that we union their type environments, as opposed to store partial tenvs together with each source file, which is the case for Clang. This means - the final global type environment is potentially huge because it contains all the types in all targets. - all analysis workers start by loading that tenv in memory, meaning we consume `|size of tenv| x #cpus` memory, which can tip the balance towards OOMs This diff attempts to economise on global tenv size. This is done by increasing sharing which is then preserved by marshalling. It's done in a brute force way, with hashtables for each struct component, and is not fully effective due to the recursion amongst types and types names, as well types appearing inside other constructs such as procnames. This is done when calling `Tenv.store` so that - the computation can be parallelised somewhat (capture is parallel, merging is not) - buck caching will benefit from smaller tenvs. This saves about 24% of total memory devoted to the type environment. Reviewed By: mbouaziz Differential Revision: D15840054 fbshipit-source-id: 6f03be1a4 --- infer/src/IR/Annot.ml | 4 + infer/src/IR/Annot.mli | 4 + infer/src/IR/Tenv.ml | 145 ++++++++++++++++++++++++++++-- infer/src/IR/Typ.ml | 6 ++ infer/src/IR/Typ.mli | 7 ++ infer/src/backend/mergeCapture.ml | 4 +- 6 files changed, 161 insertions(+), 9 deletions(-) diff --git a/infer/src/IR/Annot.ml b/infer/src/IR/Annot.ml index 37e753dfd..16ca03fbe 100644 --- a/infer/src/IR/Annot.ml +++ b/infer/src/IR/Annot.ml @@ -19,6 +19,10 @@ type t = ; parameters: parameters (** currently only one string parameter *) } [@@deriving compare] +let equal = [%compare.equal: t] + +let hash = Hashtbl.hash + let volatile = {class_name= "volatile"; parameters= []} let final = {class_name= "final"; parameters= []} diff --git a/infer/src/IR/Annot.mli b/infer/src/IR/Annot.mli index 6e3e7b8b2..50fba1d72 100644 --- a/infer/src/IR/Annot.mli +++ b/infer/src/IR/Annot.mli @@ -19,6 +19,10 @@ type t = ; parameters: parameters (** currently only one string parameter *) } [@@deriving compare] +val equal : t -> t -> bool + +val hash : t -> int + val volatile : t (** annotation for fields marked with the "volatile" keyword *) diff --git a/infer/src/IR/Tenv.ml b/infer/src/IR/Tenv.ml index e7a3089c3..10be5d4f3 100644 --- a/infer/src/IR/Tenv.ml +++ b/infer/src/IR/Tenv.ml @@ -10,14 +10,8 @@ module L = Logging (** Module for Type Environments. *) -(** Hash tables on strings. *) -module TypenameHash = Hashtbl.Make (struct - type t = Typ.Name.t - - let equal tn1 tn2 = Typ.Name.equal tn1 tn2 - - let hash = Hashtbl.hash -end) +(** Hash tables on type names. *) +module TypenameHash = Hashtbl.Make (Typ.Name) (** Type for type environment. *) type t = Typ.Struct.t TypenameHash.t @@ -163,8 +157,143 @@ let store_to_filename tenv tenv_filename = if Config.debug_mode then store_debug_file tenv tenv_filename +(** Use a prime for the initial hashtable size, since, typically, we expect a lot of types. + Prime sizes make hash functions happier across table resizes. *) +let medium_size_prime = 1003 + +module type HashconsS = sig + module Hashtable : Caml.Hashtbl.S + + val reset : unit -> unit + + val hashcons : Hashtable.key -> Hashtable.key +end + +module Hashcons (H : Caml.Hashtbl.S) : HashconsS with module Hashtable = H = struct + module Hashtable = H + + let reset, hashcons = + let tbl : H.key H.t = H.create medium_size_prime in + ( (fun () -> H.reset tbl) + , fun x -> match H.find_opt tbl x with Some x' -> x' | None -> H.add tbl x x ; x ) +end + +module HashconsList (T : Caml.Hashtbl.HashedType) (H : HashconsS with type Hashtable.key = T.t) : + HashconsS with type Hashtable.key = T.t list = struct + module Hashtable = Hashtbl.Make (struct + type t = T.t list + + let hash = Hashtbl.hash + + let equal (x : t) (y : t) = + let open Polymorphic_compare in + x = y + end) + + let reset, hashcons = + let tbl : T.t list Hashtable.t = Hashtable.create medium_size_prime in + ( (fun () -> Hashtable.reset tbl) + , fun x -> + let x = IList.map_changed x ~equal:phys_equal ~f:H.hashcons in + match Hashtable.find_opt tbl x with Some x' -> x' | None -> Hashtable.add tbl x x ; x ) +end + +module PnameHC = Hashcons (Typ.Procname.Hash) +module PnameListHC = HashconsList (Typ.Procname) (PnameHC) +module TnameHC = Hashcons (TypenameHash) +module TnameListHC = HashconsList (Typ.Name) (TnameHC) +module StringHC = Hashcons (Hashtbl.Make (String)) +module StringListHC = HashconsList (String) (StringHC) +module FieldnameHC = Hashcons (Hashtbl.Make (Typ.Fieldname)) +module TypHC = Hashcons (Hashtbl.Make (Typ)) + +module AnnotHC = struct + include Hashcons (Hashtbl.Make (Annot)) + + let hashcons ({class_name; parameters} : Annot.t) : Annot.t = + {class_name= StringHC.hashcons class_name; parameters= StringListHC.hashcons parameters} +end + +module AnnotVis = struct + type t = Annot.t * bool [@@deriving compare] + + let equal = [%compare.equal: t] + + let hash = Hashtbl.hash +end + +module AnnotVisHC = struct + include Hashcons (Hashtbl.Make (AnnotVis)) + + let hashcons (annot, visibility) = hashcons (AnnotHC.hashcons annot, visibility) +end + +module AnnotItemHC = HashconsList (AnnotVis) (AnnotVisHC) + +module Field = struct + type t = Typ.Fieldname.t * Typ.t * Annot.Item.t [@@deriving compare] + + let equal = [%compare.equal: t] + + let hash = Hashtbl.hash +end + +module FieldHC = struct + include Hashcons (Hashtbl.Make (Field)) + + let hashcons (fieldname, typ, annot_item) = + hashcons (FieldnameHC.hashcons fieldname, TypHC.hashcons typ, AnnotItemHC.hashcons annot_item) +end + +module FieldListHC = HashconsList (Field) (FieldHC) + +let reset_hashtables () = + PnameHC.reset () ; + PnameListHC.reset () ; + TnameHC.reset () ; + TnameListHC.reset () ; + StringHC.reset () ; + StringListHC.reset () ; + FieldnameHC.reset () ; + TypHC.reset () ; + AnnotHC.reset () ; + AnnotVisHC.reset () ; + AnnotItemHC.reset () ; + FieldHC.reset () ; + FieldListHC.reset () + + +(** Global tenv size is a problem in the genrule capture integration for java. + This function tries to improve sharing of values in the tenv, and assumes + Java data structures (it's still correct for Clangs, just not necessarily + as effective. *) +let canonicalize tenv = + reset_hashtables () ; + let result = create () in + let canonicalize_one tname + ({fields; statics; supers; methods; exported_objc_methods; annots} : Typ.Struct.t) = + let tname = TnameHC.hashcons tname in + let tstruct = + Typ.Struct.internal_mk_struct ~supers:(TnameListHC.hashcons supers) + ~fields:(FieldListHC.hashcons fields) ~statics:(FieldListHC.hashcons statics) + ~methods:(PnameListHC.hashcons methods) + ~exported_objc_methods:(PnameListHC.hashcons exported_objc_methods) + ~annots:(AnnotItemHC.hashcons annots) () + in + TypenameHash.add result tname tstruct + in + TypenameHash.iter canonicalize_one tenv ; + reset_hashtables () ; + result + + let store_global tenv = (* update in-memory global tenv for later uses by this process, e.g. in single-core mode the frontend and backend run in the same process *) + L.debug Capture Quiet "Tenv.store: global tenv has size %d bytes.@." + (Obj.(reachable_words (repr tenv)) * (Sys.word_size / 8)) ; + let tenv = canonicalize tenv in + L.debug Capture Quiet "Tenv.store: canonicalized tenv has size %d bytes.@." + (Obj.(reachable_words (repr tenv)) * (Sys.word_size / 8)) ; global_tenv := Some tenv ; store_to_filename tenv global_tenv_path diff --git a/infer/src/IR/Typ.ml b/infer/src/IR/Typ.ml index 684bd0c30..871fcddc9 100644 --- a/infer/src/IR/Typ.ml +++ b/infer/src/IR/Typ.ml @@ -231,6 +231,8 @@ end include T +let hash = Hashtbl.hash + let mk_type_quals ?default ?is_const ?is_restrict ?is_volatile () = let default_ = {is_const= false; is_restrict= false; is_volatile= false} in let mk_aux ?(default = default_) ?(is_const = default.is_const) @@ -349,6 +351,8 @@ module Name = struct let equal = [%compare.equal: t] + let hash = Hashtbl.hash + let qual_name = function | CStruct name | CUnion name | ObjcClass name | ObjcProtocol name -> name @@ -1392,6 +1396,8 @@ module Fieldname = struct let equal = [%compare.equal: t] + let hash = Hashtbl.hash + module T = struct type nonrec t = t diff --git a/infer/src/IR/Typ.mli b/infer/src/IR/Typ.mli index 179ec0ba9..bbd23408a 100644 --- a/infer/src/IR/Typ.mli +++ b/infer/src/IR/Typ.mli @@ -127,6 +127,8 @@ and template_spec_info = ; args: template_arg list } [@@deriving compare] +val hash : t -> int + val pp_template_spec_info : Pp.env -> F.formatter -> template_spec_info -> unit [@@warning "-32"] val mk : ?default:t -> ?quals:type_quals -> desc -> t @@ -157,6 +159,8 @@ module Name : sig val equal : t -> t -> bool (** Equality for typenames *) + val hash : t -> int + val to_string : t -> string (** convert the typename to a string *) @@ -623,6 +627,9 @@ module Fieldname : sig type t [@@deriving compare] val equal : t -> t -> bool + + val hash : t -> int + (** Equality for field names. *) (** Set for fieldnames *) diff --git a/infer/src/backend/mergeCapture.ml b/infer/src/backend/mergeCapture.ml index 32823cf1b..d806dffcb 100644 --- a/infer/src/backend/mergeCapture.ml +++ b/infer/src/backend/mergeCapture.ml @@ -11,6 +11,7 @@ module L = Logging (** Module to merge the results of capture for different buck targets. *) let merge_global_tenvs infer_deps_file = + let time0 = Mtime_clock.counter () in let global_tenv = Tenv.create () in let merge infer_out_src = let global_tenv_path = @@ -20,7 +21,8 @@ let merge_global_tenvs infer_deps_file = |> Option.iter ~f:(fun tenv -> Tenv.merge ~src:tenv ~dst:global_tenv) in MergeResults.iter_infer_deps infer_deps_file ~f:merge ; - Tenv.store_global global_tenv + Tenv.store_global global_tenv ; + L.progress "Merging type environments took %a@." Mtime.Span.pp (Mtime_clock.count time0) let merge_captured_targets () =