[buck/java2] hashcons the global tenv during merging

Summary: One "interesting" feature of the approach of merging the captured targets in Java is that we union their type environments, as opposed to storing partial tenvs together with each source file, which is the case for Clang. This means:
- the final global type environment is potentially huge because it contains all the types in all targets;
- all analysis workers start by loading that tenv in memory, meaning we consume `|size of tenv| x #cpus` memory, which can tip the balance towards OOMs.

This diff attempts to economise on global tenv size. This is done by increasing sharing, which is then preserved by marshalling. It's done in a brute-force way, with hashtables for each struct component, and is not fully effective due to the recursion amongst types and type names, as well as types appearing inside other constructs such as procnames.

This is done when calling `Tenv.store` so that:
- the computation can be parallelised somewhat (capture is parallel, merging is not);
- buck caching will benefit from smaller tenvs.

This saves about 24% of total memory devoted to the type environment.

Reviewed By: mbouaziz
Differential Revision: D15840054
fbshipit-source-id: 6f03be1a4
7 years ago · 013d153538
parent 8776a31f7d
commit 013d153538
6 changed files with 161 additions and 9 deletions
--- a/infer/src/IR/Annot.ml
+++ b/infer/src/IR/Annot.ml
@ -19,6 +19,10 @@ type t =
  ; parameters: parameters  (** currently only one string parameter *) }
 [@@deriving compare]

+(** equality on annotations, derived from [compare] *)
+let equal = [%compare.equal: t]
+
+(** polymorphic [Hashtbl.hash] applied to an annotation *)
+let hash = Hashtbl.hash
+
 let volatile = {class_name= "volatile"; parameters= []}

 let final = {class_name= "final"; parameters= []}
--- a/infer/src/IR/Annot.mli
+++ b/infer/src/IR/Annot.mli
@ -19,6 +19,10 @@ type t =
  ; parameters: parameters  (** currently only one string parameter *) }
 [@@deriving compare]

+val equal : t -> t -> bool
+(** equality on annotations *)
+
+val hash : t -> int
+(** hash of an annotation, so that annotations can be used as hashtable keys *)
+
 val volatile : t
 (** annotation for fields marked with the "volatile" keyword *)

--- a/infer/src/IR/Tenv.ml
+++ b/infer/src/IR/Tenv.ml
@ -10,14 +10,8 @@ module L = Logging

 (** Module for Type Environments. *)

-(** Hash tables on strings. *)
-module TypenameHash = Hashtbl.Make (struct
-  type t = Typ.Name.t
-
-  let equal tn1 tn2 = Typ.Name.equal tn1 tn2
-
-  let hash = Hashtbl.hash
-end)
+(** Hash tables on type names, keyed with the [equal] and [hash] functions of [Typ.Name]. *)
+module TypenameHash = Hashtbl.Make (Typ.Name)

 (** Type for type environment. *)
 type t = Typ.Struct.t TypenameHash.t
@ -163,8 +157,143 @@ let store_to_filename tenv tenv_filename =
  if Config.debug_mode then store_debug_file tenv tenv_filename


+(** Initial size hint for the hashconsing tables: typically, we expect a lot of types, so start
+    reasonably large. 1009 is prime (the previous value, 1003 = 17 * 59, was not). The value is
+    only a hint: [Hashtbl.create] derives its own number of buckets from it. *)
+let medium_size_prime = 1009
+
+(** Interface of a hashconsing table, which maps a value to a shared representative. *)
+module type HashconsS = sig
+  module Hashtable : Caml.Hashtbl.S
+
+  val reset : unit -> unit
+  (** empty the underlying table *)
+
+  val hashcons : Hashtable.key -> Hashtable.key
+  (** return the representative to use in place of the given value *)
+end
+
+(** Build a hashconsing table on top of the hashtable implementation [H]: [hashcons x] returns the
+    value already recorded for a key equal to [x] if there is one, otherwise it records [x] as its
+    own representative and returns it. *)
+module Hashcons (H : Caml.Hashtbl.S) : HashconsS with module Hashtable = H = struct
+  module Hashtable = H
+
+  (* [reset] and [hashcons] are defined together so that both close over the same table, which is
+     created once per application of the functor and is not otherwise accessible *)
+  let reset, hashcons =
+    let tbl : H.key H.t = H.create medium_size_prime in
+    ( (fun () -> H.reset tbl)
+    , fun x -> match H.find_opt tbl x with Some x' -> x' | None -> H.add tbl x x ; x )
+end
+
+(** Hashconsing of lists of [T.t]: every element is first passed through [H.hashcons], then the
+    resulting list is looked up in (or added to) a table of lists owned by this functor
+    application. *)
+module HashconsList (T : Caml.Hashtbl.HashedType) (H : HashconsS with type Hashtable.key = T.t) :
+  HashconsS with type Hashtable.key = T.t list = struct
+  (* lists are hashed and compared with the polymorphic primitives; [T.hash] and [T.equal] are not
+     used here, [T] only provides the element type *)
+  module Hashtable = Hashtbl.Make (struct
+    type t = T.t list
+
+    let hash = Hashtbl.hash
+
+    let equal (x : t) (y : t) =
+      let open Polymorphic_compare in
+      x = y
+  end)
+
+  (* same pattern as in [Hashcons]: both closures share one table.
+     NOTE(review): [IList.map_changed ~equal:phys_equal] presumably returns its input list when
+     every element is already physically equal to its representative -- confirm in [IList] *)
+  let reset, hashcons =
+    let tbl : T.t list Hashtable.t = Hashtable.create medium_size_prime in
+    ( (fun () -> Hashtable.reset tbl)
+    , fun x ->
+        let x = IList.map_changed x ~equal:phys_equal ~f:H.hashcons in
+        match Hashtable.find_opt tbl x with Some x' -> x' | None -> Hashtable.add tbl x x ; x )
+end
+
+(* Hashconsing tables for procedure names, type names, strings, field names and types, together
+   with the list variants that are built on top of them. *)
+module PnameHC = Hashcons (Typ.Procname.Hash)
+module PnameListHC = HashconsList (Typ.Procname) (PnameHC)
+module TnameHC = Hashcons (TypenameHash)
+module TnameListHC = HashconsList (Typ.Name) (TnameHC)
+module StringHC = Hashcons (Hashtbl.Make (String))
+module StringListHC = HashconsList (String) (StringHC)
+module FieldnameHC = Hashcons (Hashtbl.Make (Typ.Fieldname))
+module TypHC = Hashcons (Hashtbl.Make (Typ))
+
+(** Hashconsing of annotations: the class name and the parameters are hashconsed first, then the
+    rebuilt record itself goes through the table provided by [Hashcons]. *)
+module AnnotHC = struct
+  include Hashcons (Hashtbl.Make (Annot))
+
+  let hashcons ({class_name; parameters} : Annot.t) : Annot.t =
+    (* the trailing [hashcons] is the one from the [include] above (this definition is not
+       recursive); without it the table is never populated and every call returns a freshly
+       allocated record instead of a shared one *)
+    {class_name= StringHC.hashcons class_name; parameters= StringListHC.hashcons parameters}
+    |> hashcons
+end
+
+(** An annotation paired with a boolean (its visibility), packaged with [equal] and [hash] so that
+    it can be used as a hashtable key. *)
+module AnnotVis = struct
+  type t = Annot.t * bool [@@deriving compare]
+
+  let equal = [%compare.equal: t]
+
+  let hash = Hashtbl.hash
+end
+
+(** Hashconsing of [AnnotVis.t] pairs: the annotation is hashconsed with [AnnotHC] before the pair
+    itself is looked up in the table. *)
+module AnnotVisHC = struct
+  include Hashcons (Hashtbl.Make (AnnotVis))
+
+  (* the [hashcons] on the right-hand side is the one from the [include] above *)
+  let hashcons (annot, visibility) = hashcons (AnnotHC.hashcons annot, visibility)
+end
+
+(** Hashconsing of lists of [AnnotVis.t], with elements hashconsed by [AnnotVisHC]. *)
+module AnnotItemHC = HashconsList (AnnotVis) (AnnotVisHC)
+
+(** A field name together with a type and an annotation item, packaged with [equal] and [hash] so
+    that it can be used as a hashtable key. *)
+module Field = struct
+  type t = Typ.Fieldname.t * Typ.t * Annot.Item.t [@@deriving compare]
+
+  let equal = [%compare.equal: t]
+
+  let hash = Hashtbl.hash
+end
+
+(** Hashconsing of [Field.t] triples: each component is hashconsed with its own table before the
+    triple itself is looked up. *)
+module FieldHC = struct
+  include Hashcons (Hashtbl.Make (Field))
+
+  (* the outer [hashcons] on the right-hand side is the one from the [include] above *)
+  let hashcons (fieldname, typ, annot_item) =
+    hashcons (FieldnameHC.hashcons fieldname, TypHC.hashcons typ, AnnotItemHC.hashcons annot_item)
+end
+
+(** Hashconsing of lists of [Field.t], with elements hashconsed by [FieldHC]. *)
+module FieldListHC = HashconsList (Field) (FieldHC)
+
+(** Call [reset] on every hashconsing table defined above. *)
+let reset_hashtables () =
+  PnameHC.reset () ;
+  PnameListHC.reset () ;
+  TnameHC.reset () ;
+  TnameListHC.reset () ;
+  StringHC.reset () ;
+  StringListHC.reset () ;
+  FieldnameHC.reset () ;
+  TypHC.reset () ;
+  AnnotHC.reset () ;
+  AnnotVisHC.reset () ;
+  AnnotItemHC.reset () ;
+  FieldHC.reset () ;
+  FieldListHC.reset ()
+
+
+(** Global tenv size is a problem in the genrule capture integration for Java.
+    This function tries to improve sharing of values in the tenv, and assumes
+    Java data structures (it's still correct for Clang, just not necessarily
+    as effective).
+
+    Returns a new tenv in which each type name of [tenv] is bound to a struct rebuilt from
+    hashconsed components; [tenv] itself is only iterated over. *)
+let canonicalize tenv =
+  (* start from empty hashconsing tables *)
+  reset_hashtables () ;
+  let result = create () in
+  let canonicalize_one tname
+      ({fields; statics; supers; methods; exported_objc_methods; annots} : Typ.Struct.t) =
+    let tname = TnameHC.hashcons tname in
+    let tstruct =
+      Typ.Struct.internal_mk_struct ~supers:(TnameListHC.hashcons supers)
+        ~fields:(FieldListHC.hashcons fields) ~statics:(FieldListHC.hashcons statics)
+        ~methods:(PnameListHC.hashcons methods)
+        ~exported_objc_methods:(PnameListHC.hashcons exported_objc_methods)
+        ~annots:(AnnotItemHC.hashcons annots) ()
+    in
+    TypenameHash.add result tname tstruct
+  in
+  TypenameHash.iter canonicalize_one tenv ;
+  (* empty the tables again now that [result] has been built *)
+  reset_hashtables () ;
+  result
+
+
+(** Canonicalize [tenv], record the result as the in-memory global tenv and store it to
+    [global_tenv_path]; the reachable size of the tenv is logged before and after
+    canonicalization. *)
 let store_global tenv =
  (* update in-memory global tenv for later uses by this process, e.g. in single-core mode the
     frontend and backend run in the same process *)
+  (* NOTE(review): the size passed to [L.debug] is an ordinary argument, so [Obj.reachable_words]
+     is evaluated before [L.debug] is applied, whether or not the message ends up being logged *)
+  L.debug Capture Quiet "Tenv.store: global tenv has size %d bytes.@."
+    (Obj.(reachable_words (repr tenv)) * (Sys.word_size / 8)) ;
+  let tenv = canonicalize tenv in
+  L.debug Capture Quiet "Tenv.store: canonicalized tenv has size %d bytes.@."
+    (Obj.(reachable_words (repr tenv)) * (Sys.word_size / 8)) ;
  global_tenv := Some tenv ;
  store_to_filename tenv global_tenv_path
--- a/infer/src/IR/Typ.ml
+++ b/infer/src/IR/Typ.ml
@ -231,6 +231,8 @@ end

 include T

+(** polymorphic [Hashtbl.hash] applied to a type *)
+let hash = Hashtbl.hash
+
 let mk_type_quals ?default ?is_const ?is_restrict ?is_volatile () =
  let default_ = {is_const= false; is_restrict= false; is_volatile= false} in
  let mk_aux ?(default = default_) ?(is_const = default.is_const)
@ -349,6 +351,8 @@ module Name = struct

  let equal = [%compare.equal: t]

+  (** polymorphic [Hashtbl.hash] applied to a type name *)
+  let hash = Hashtbl.hash
+
  let qual_name = function
    | CStruct name | CUnion name | ObjcClass name | ObjcProtocol name ->
        name
@ -1392,6 +1396,8 @@ module Fieldname = struct

  let equal = [%compare.equal: t]

+  (** polymorphic [Hashtbl.hash] applied to a field name *)
+  let hash = Hashtbl.hash
+
  module T = struct
    type nonrec t = t

--- a/infer/src/IR/Typ.mli
+++ b/infer/src/IR/Typ.mli
@ -127,6 +127,8 @@ and template_spec_info =
      ; args: template_arg list }
 [@@deriving compare]

+val hash : t -> int
+(** Hash for types, so that types can be used as hashtable keys *)
+
 val pp_template_spec_info : Pp.env -> F.formatter -> template_spec_info -> unit [@@warning "-32"]

 val mk : ?default:t -> ?quals:type_quals -> desc -> t
@ -157,6 +159,8 @@ module Name : sig
  val equal : t -> t -> bool
  (** Equality for typenames *)

+  val hash : t -> int
+  (** Hash for typenames *)
+
  val to_string : t -> string
  (** convert the typename to a string *)

@ -623,6 +627,9 @@ module Fieldname : sig
  type t [@@deriving compare]

  val equal : t -> t -> bool
  (** Equality for field names. *)
+
+  val hash : t -> int
+  (** Hash for field names, so that field names can be used as hashtable keys. *)

  (** Set for fieldnames *)
--- a/infer/src/backend/mergeCapture.ml
+++ b/infer/src/backend/mergeCapture.ml
@ -11,6 +11,7 @@ module L = Logging
 (** Module to merge the results of capture for different buck targets. *)

 let merge_global_tenvs infer_deps_file =
+  let time0 = Mtime_clock.counter () in
  let global_tenv = Tenv.create () in
  let merge infer_out_src =
    let global_tenv_path =
@ -20,7 +21,8 @@ let merge_global_tenvs infer_deps_file =
    |> Option.iter ~f:(fun tenv -> Tenv.merge ~src:tenv ~dst:global_tenv)
  in
  MergeResults.iter_infer_deps infer_deps_file ~f:merge ;
-  Tenv.store_global global_tenv
+  Tenv.store_global global_tenv ;
+  L.progress "Merging type environments took %a@." Mtime.Span.pp (Mtime_clock.count time0)


 let merge_captured_targets () =