Summary: The global type environment is loaded in all analysis workers, so it is best to minimise its memory footprint. `MaxSharing` does a great job at this but - it is very slow - it may be involved in observed segfaults, since it uses potentially unsafe operations This diff replaces `MaxSharing` with a post-facto poor-man's hashcons pass through the parts involved in a java type environment. It is not as efficient as `MaxSharing` at reducing footprint, but it's much faster. Reviewed By: jberdine Differential Revision: D24590400 fbshipit-source-id: c37100325master
parent
fcc27cf7a2
commit
4fb2fde4af
@ -0,0 +1,47 @@
|
||||
(*
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*)
|
||||
open! Core
|
||||
|
||||
module type NormalizedT = sig
|
||||
include Caml.Hashtbl.HashedType
|
||||
|
||||
val normalize : t -> t
|
||||
end
|
||||
|
||||
module type S = sig
|
||||
type t
|
||||
|
||||
val normalize : t -> t
|
||||
|
||||
val reset : unit -> unit
|
||||
end
|
||||
|
||||
module Make (T : NormalizedT) = struct
|
||||
type t = T.t
|
||||
|
||||
module H = Caml.Hashtbl.Make (T)
|
||||
|
||||
let table : t H.t = H.create 11
|
||||
|
||||
let normalize t =
|
||||
match H.find_opt table t with
|
||||
| Some t' ->
|
||||
t'
|
||||
| None ->
|
||||
let normalized = T.normalize t in
|
||||
H.add table normalized normalized ;
|
||||
normalized
|
||||
|
||||
|
||||
let reset () = H.reset table
|
||||
end
|
||||
|
||||
module StringNormalizer = Make (struct
|
||||
include String
|
||||
|
||||
let normalize = Fn.id
|
||||
end)
|
@ -0,0 +1,30 @@
|
||||
(*
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*)
|
||||
|
||||
(** a hashed type with a normalization function which respects equality *)
|
||||
module type NormalizedT = sig
|
||||
include Caml.Hashtbl.HashedType
|
||||
|
||||
val normalize : t -> t
|
||||
end
|
||||
|
||||
(** normalizer module which uses a hashtable to store normalized representatives *)
|
||||
module type S = sig
|
||||
(** type the normalizer works on *)
|
||||
type t
|
||||
|
||||
val normalize : t -> t
|
||||
(** return equal normalized representative *)
|
||||
|
||||
val reset : unit -> unit
|
||||
(** reset underlying hashtable *)
|
||||
end
|
||||
|
||||
module Make (T : NormalizedT) : S with type t = T.t
|
||||
|
||||
(** normalizer for strings *)
|
||||
module StringNormalizer : S with type t = string
|
@ -1,287 +0,0 @@
|
||||
(*
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*)
|
||||
|
||||
open! IStd
|
||||
|
||||
exception MaximumSharingLazyValue
|
||||
|
||||
module Hashing : sig
|
||||
type hash_value = private int
|
||||
|
||||
type state
|
||||
|
||||
val of_int : int -> hash_value
|
||||
|
||||
val shallow : 'a -> hash_value
|
||||
(** A deterministic hash function that visits O(1) objects, to ensure termination on cyclic
|
||||
values. *)
|
||||
|
||||
val alloc_of_block : tag:int -> size:int -> state
|
||||
|
||||
val fold_hash_value : state -> hash_value -> state
|
||||
|
||||
val get_hash_value : state -> hash_value
|
||||
end = struct
|
||||
type hash_value = Hash.hash_value
|
||||
|
||||
type state = Hash.state
|
||||
|
||||
let of_int = Fn.id
|
||||
|
||||
let shallow =
|
||||
(*
|
||||
[hash x] is defined as [seeded_hash_param 10 100 0 x].
|
||||
Here we don't care about the specific numbers as long as the function is deterministic.
|
||||
*)
|
||||
Caml.Hashtbl.hash
|
||||
|
||||
|
||||
let alloc_of_block ~tag ~size =
|
||||
let state = Hash.alloc () in
|
||||
let state = Hash.fold_int state tag in
|
||||
let state = Hash.fold_int state size in
|
||||
state
|
||||
|
||||
|
||||
let fold_hash_value = Hash.fold_int
|
||||
|
||||
let get_hash_value = Hash.get_hash_value
|
||||
end
|
||||
|
||||
module Sharer : sig
|
||||
type t
|
||||
|
||||
val create : unit -> t
|
||||
|
||||
val normalize_value : t -> 'a -> 'a
|
||||
end = struct
|
||||
let hashed_obj eq =
|
||||
( module struct
|
||||
type t = Hashing.hash_value * Obj.t
|
||||
|
||||
let hash ((h, _) : t) = (h :> int)
|
||||
|
||||
let equal ((h1, o1) : t) ((h2, o2) : t) = Int.equal (h1 :> int) (h2 :> int) && eq o1 o2
|
||||
end : Caml.Hashtbl.HashedType
|
||||
with type t = Hashing.hash_value * Obj.t )
|
||||
|
||||
|
||||
module HashedNoscanBlock = (val hashed_obj Poly.equal)
|
||||
|
||||
module PhysEqualedHashedScannableBlock = (val hashed_obj phys_equal)
|
||||
|
||||
module HashedNormalizedScannableBlock = (val hashed_obj PhysEqual.shallow_equal)
|
||||
|
||||
module HNoscan = Caml.Hashtbl.Make (HashedNoscanBlock)
|
||||
module HPhysEq = Caml.Hashtbl.Make (PhysEqualedHashedScannableBlock)
|
||||
module HNorm = Caml.Hashtbl.Make (HashedNormalizedScannableBlock)
|
||||
|
||||
type visited =
|
||||
| Visiting of {mutable to_patch: (PhysEqualedHashedScannableBlock.t * int) list}
|
||||
| Normalized of HashedNormalizedScannableBlock.t
|
||||
|
||||
type t =
|
||||
{ noscan_blocks: HashedNoscanBlock.t HNoscan.t
|
||||
; visited_blocks: visited HPhysEq.t
|
||||
; hash_normalized: HashedNormalizedScannableBlock.t HNorm.t
|
||||
; fail_on_forward: bool
|
||||
; fail_on_nonstring: bool
|
||||
; fail_on_objects: bool }
|
||||
|
||||
let create () =
|
||||
{ noscan_blocks= HNoscan.create 1
|
||||
; visited_blocks= HPhysEq.create 1
|
||||
; hash_normalized= HNorm.create 1
|
||||
; (* these are just for safety because the code hasn't been tested on them, it should work fine though *)
|
||||
fail_on_forward= true
|
||||
; fail_on_nonstring= true
|
||||
; fail_on_objects= true }
|
||||
|
||||
|
||||
let hash_and_normalize_int o = (Hashing.of_int (Obj.obj o : int), o)
|
||||
|
||||
let dummy_should_not_be_hashed_or_used =
|
||||
(*
|
||||
Must be different than any block found in values.
|
||||
Must fail if hashed (there is actually no way to ensure that :( ))
|
||||
*)
|
||||
Obj.repr (lazy (assert false))
|
||||
|
||||
|
||||
(*
|
||||
TODO: be much more efficient and write it in C to be able to use the GC flags to
|
||||
mark visited values.
|
||||
*)
|
||||
let rec hash_and_normalize_obj sharer o parent_shallow_hash_block parent_field_i =
|
||||
if Obj.is_int o then hash_and_normalize_int o
|
||||
else hash_and_normalize_block sharer o parent_shallow_hash_block parent_field_i
|
||||
|
||||
|
||||
and hash_and_normalize_block sharer block parent_shallow_hash_block parent_field_i =
|
||||
let shallow_hash = Hashing.shallow block in
|
||||
let shallow_hash_block = (shallow_hash, block) in
|
||||
let tag = Obj.tag block in
|
||||
if tag >= Obj.no_scan_tag then (
|
||||
(*
|
||||
No-scan blocks (strings, int64, closures, weird stuff) are treated separately.
|
||||
They are hashed and compared using the Stdlib polymorphic functions.
|
||||
*)
|
||||
assert ((not sharer.fail_on_nonstring) || Int.equal tag Obj.string_tag) ;
|
||||
match HNoscan.find_opt sharer.noscan_blocks shallow_hash_block with
|
||||
| Some hash_normalized ->
|
||||
hash_normalized
|
||||
| None ->
|
||||
HNoscan.add sharer.noscan_blocks shallow_hash_block shallow_hash_block ;
|
||||
shallow_hash_block )
|
||||
else if Int.equal tag Obj.lazy_tag then
|
||||
(*
|
||||
For now MaximumSharing is used before marshalling.
|
||||
It makes little sense to marshal lazy values.
|
||||
Because lazy blocks are normal scannable blocks, this special case could be safely removed.
|
||||
*)
|
||||
raise MaximumSharingLazyValue
|
||||
else
|
||||
(*
|
||||
This is where we could win by mutating the value directly.
|
||||
Instead we need to use a hashtbl using a shallow hash (for termination), which adds a
|
||||
multiplicative factor to the running time.
|
||||
*)
|
||||
match HPhysEq.find_opt sharer.visited_blocks shallow_hash_block with
|
||||
| Some (Normalized hash_normalized) ->
|
||||
(* The block has already been visited, we can reuse the result. *)
|
||||
hash_normalized
|
||||
| Some (Visiting visiting) ->
|
||||
(*
|
||||
The block is being visited, which means we have a cycle.
|
||||
We record fields to be patched after we have finished treating the cycle.
|
||||
|
||||
For termination we have to return a shallow hash.
|
||||
We also need to return a phys_equally different value so that it will trigger
|
||||
copy-on-write on the whole cycle (then patch can safely be applied and in any order),
|
||||
even though it may not be necessary if the whole cycle and its dependencies could be
|
||||
kept as-is.
|
||||
The value that is returned should not be hashed or used. The current implementation
|
||||
respects it.
|
||||
*)
|
||||
visiting.to_patch <- (parent_shallow_hash_block, parent_field_i) :: visiting.to_patch ;
|
||||
(shallow_hash, dummy_should_not_be_hashed_or_used)
|
||||
| None ->
|
||||
let visited = Visiting {to_patch= []} in
|
||||
let[@warning "-8"] (Visiting visiting) = visited in
|
||||
HPhysEq.add sharer.visited_blocks shallow_hash_block visited ;
|
||||
let hash_normalized =
|
||||
if Int.equal tag Obj.forward_tag then (
|
||||
assert (not sharer.fail_on_forward) ;
|
||||
(*
|
||||
Forward_tag is an intermediate block resulting from the evaluating of a lazy.
|
||||
As an optimization, let's replace it directly with the normalization of the result
|
||||
as if this intermediate block didn't exist.
|
||||
|
||||
This remains untested for now (hence the assertion above).
|
||||
Not obvious to test as optimizations or the GC can already do the substitution.
|
||||
*)
|
||||
hash_and_normalize_obj sharer (Obj.field block 0) parent_shallow_hash_block
|
||||
parent_field_i )
|
||||
else (
|
||||
(* For regular blocks, normalize each field then use a shallow comparison. *)
|
||||
assert ((not sharer.fail_on_objects) || not (Int.equal tag Obj.object_tag)) ;
|
||||
let hash_shallow_normalized =
|
||||
let size = Obj.size block in
|
||||
hash_and_normalize_block_fields sharer shallow_hash_block block block size 0
|
||||
(Hashing.alloc_of_block ~tag ~size)
|
||||
in
|
||||
match HNorm.find_opt sharer.hash_normalized hash_shallow_normalized with
|
||||
| Some hash_normalized ->
|
||||
hash_normalized
|
||||
| None ->
|
||||
HNorm.add sharer.hash_normalized hash_shallow_normalized hash_shallow_normalized ;
|
||||
hash_shallow_normalized )
|
||||
in
|
||||
let hash_normalized =
|
||||
match visiting.to_patch with
|
||||
| [] (* not the head of a cycle *) ->
|
||||
hash_normalized
|
||||
| _ :: _ as to_patch ->
|
||||
(*
|
||||
The whole cycle has been treated, we now need to patch values that pointed to
|
||||
this block. We need to look them up in the [visited_blocks] hash table because
|
||||
they have been duplicated since we recorded them.
|
||||
*)
|
||||
let _, normalized = hash_normalized in
|
||||
List.iter to_patch ~f:(fun (hash_block_to_patch, field_i_to_patch) ->
|
||||
let normalized_block_to_patch =
|
||||
if phys_equal hash_block_to_patch shallow_hash_block then
|
||||
(* Self-cycle, e.g. [let rec x = 1 :: x]. No lookup! *)
|
||||
normalized
|
||||
else
|
||||
let[@warning "-8"] (Normalized (_, normalized_block_to_patch)) =
|
||||
HPhysEq.find sharer.visited_blocks hash_block_to_patch
|
||||
in
|
||||
normalized_block_to_patch
|
||||
in
|
||||
Obj.set_field normalized_block_to_patch field_i_to_patch normalized ) ;
|
||||
(*
|
||||
For cycle heads, for consistency with the [Visiting] case above we need to
|
||||
use the shallow hash.
|
||||
*)
|
||||
(shallow_hash, normalized)
|
||||
in
|
||||
HPhysEq.replace sharer.visited_blocks shallow_hash_block (Normalized hash_normalized) ;
|
||||
hash_normalized
|
||||
|
||||
|
||||
and hash_and_normalize_block_fields sharer original_shallow_hash_block original_block new_block
|
||||
size field_i hash_state =
|
||||
if field_i >= size then (Hashing.get_hash_value hash_state, new_block)
|
||||
else
|
||||
let field_v = Obj.field original_block field_i in
|
||||
let field_hash, field_v' =
|
||||
hash_and_normalize_obj sharer field_v original_shallow_hash_block field_i
|
||||
in
|
||||
let hash_state = Hashing.fold_hash_value hash_state field_hash in
|
||||
let new_block =
|
||||
if phys_equal field_v field_v' then new_block
|
||||
else
|
||||
let new_block =
|
||||
if phys_equal original_block new_block then (* copy-on-write *)
|
||||
Obj.dup original_block
|
||||
else new_block
|
||||
in
|
||||
Obj.set_field new_block field_i field_v' ;
|
||||
new_block
|
||||
in
|
||||
(hash_and_normalize_block_fields [@tailcall]) sharer original_shallow_hash_block
|
||||
original_block new_block size (field_i + 1) hash_state
|
||||
|
||||
|
||||
let dummy_should_not_be_patched =
|
||||
(Hashing.of_int 0, (* Make sure it fails hard if [Obj.set_field] is called on it *) Obj.repr 0)
|
||||
|
||||
|
||||
(** Returns a value structurally equal but with potentially more sharing. Potentially unsafe if
|
||||
used on mutable values that are modified later. Preserves polymorphic compare, hashing,
|
||||
no-sharing marshalling. May have an impact on code using [phys_equal] or marshalling with
|
||||
sharing. *)
|
||||
let normalize_value sharer v =
|
||||
hash_and_normalize_obj sharer (Obj.repr v) dummy_should_not_be_patched (-1) |> snd |> Obj.obj
|
||||
end
|
||||
|
||||
module ForHashtbl (H : Caml.Hashtbl.S) = struct
|
||||
let normalize h =
|
||||
let sharer = Sharer.create () in
|
||||
(* If a hash table has been created with [add] and not [replace] only, it is possible to
|
||||
have several values for a given key. We need to collect them all and reinsert them in
|
||||
the reverse order. *)
|
||||
let rev_bindings = H.fold (fun k v acc -> (k, v) :: acc) h [] in
|
||||
(* No need to preserve the initial size of the original hash table *)
|
||||
let h' = H.create (H.length h) in
|
||||
List.iter rev_bindings ~f:(fun (k, v) ->
|
||||
let k' = Sharer.normalize_value sharer k in
|
||||
let v' = Sharer.normalize_value sharer v in
|
||||
H.add h' k' v' ) ;
|
||||
h'
|
||||
end
|
@ -1,23 +0,0 @@
|
||||
(*
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*)
|
||||
|
||||
open! IStd
|
||||
|
||||
(** Current implementation will stack overflow on deep values (TODO: a tailrec version). *)
|
||||
|
||||
module Sharer : sig
|
||||
type t
|
||||
|
||||
val create : unit -> t
|
||||
|
||||
val normalize_value : t -> 'a -> 'a
|
||||
end
|
||||
|
||||
module ForHashtbl (H : Caml.Hashtbl.S) : sig
|
||||
val normalize : 'a H.t -> 'a H.t
|
||||
(** Duplicate a hash table with maximum sharing. *)
|
||||
end
|
@ -1,79 +0,0 @@
|
||||
(*
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*)
|
||||
|
||||
open! IStd
|
||||
open OUnit2
|
||||
|
||||
let inputs =
|
||||
let a = `A 'a' in
|
||||
let b = Array.create ~len:10_000 a in
|
||||
let c = Array.create ~len:1_000 b in
|
||||
let d = Array.create ~len:1_000 c in
|
||||
let rec e = 1 :: e in
|
||||
let rec f = 1 :: 2 :: f in
|
||||
[ ("unit", Obj.repr (), `PhysEqual)
|
||||
; ("same representation", Obj.repr ([42], [|42; 0|]), `MarshalNoSharing_MustBeBetter)
|
||||
; ("10K times the same element", Obj.repr b, `PhysEqual)
|
||||
; ("1K times 10K times the same element", Obj.repr c, `PhysEqual)
|
||||
; ("1K times 1K times 10K times the same element", Obj.repr d, `PhysEqual)
|
||||
; ("Self cycle", Obj.repr e, `MarshalWithSharing (* ideally `PhysEqual *))
|
||||
; ("Cyclic value", Obj.repr f, `MarshalWithSharing (* ideally `PhysEqual *)) ]
|
||||
|
||||
|
||||
let tests =
|
||||
let normalize input =
|
||||
let sharer = MaximumSharing.Sharer.create () in
|
||||
MaximumSharing.Sharer.normalize_value sharer input
|
||||
in
|
||||
let test_one input checks _ =
|
||||
(* Save this now, in case `MaximumSharing` mutates the [input], even though it shouldn't *)
|
||||
let serialized_input_with_sharing = Marshal.to_string input [] in
|
||||
let serialized_input_no_sharing =
|
||||
match checks with
|
||||
| `PhysEqual | `MarshalWithSharing ->
|
||||
"UNUSED"
|
||||
| `MarshalNoSharing_MustBeBetter ->
|
||||
(* OOMs for big or cyclic values *)
|
||||
Marshal.to_string input [Marshal.No_sharing]
|
||||
in
|
||||
let reachable_words_input = Obj.reachable_words input in
|
||||
let normalized = normalize input in
|
||||
(*
|
||||
We can't really check [input] hasn't been mutated but its marshalling with sharing
|
||||
shouldn't have changed
|
||||
*)
|
||||
let serialized_input_with_sharing' = Marshal.to_string input [] in
|
||||
assert_equal serialized_input_with_sharing serialized_input_with_sharing' ;
|
||||
(*
|
||||
The whole goal of [MaximumSharing] is to reduce the memory footprint.
|
||||
Let's make sure this contract is fulfilled.
|
||||
There is no guarantee the serialized version will be smaller, e.g.
|
||||
[let x = Some 0 in (x, Array.init 254 (fun i -> Some i), x)]
|
||||
is smaller with no sharing.
|
||||
*)
|
||||
let reachable_words_normalized = Obj.reachable_words normalized in
|
||||
assert_bool "less reachable words" (reachable_words_normalized <= reachable_words_input) ;
|
||||
(* Cannot use [assert_equal] because it doesn't shortcut physical equalities *)
|
||||
match checks with
|
||||
| `PhysEqual ->
|
||||
assert_bool "phys_equal" (phys_equal input normalized)
|
||||
| `MarshalWithSharing ->
|
||||
let serialized_normalized_with_sharing = Marshal.to_string normalized [] in
|
||||
assert_equal serialized_input_with_sharing serialized_normalized_with_sharing
|
||||
| `MarshalNoSharing_MustBeBetter ->
|
||||
assert_bool "equal" (Poly.equal input normalized) ;
|
||||
assert_bool "strictly less reachable words"
|
||||
(reachable_words_normalized < reachable_words_input) ;
|
||||
(*
|
||||
In case structural equality and marshalling have slightly different semantics,
|
||||
let's also make sure the serialized versions are indistinguishable
|
||||
*)
|
||||
let serialized_normalized_no_sharing = Marshal.to_string normalized [Marshal.No_sharing] in
|
||||
assert_equal serialized_input_no_sharing serialized_normalized_no_sharing
|
||||
in
|
||||
let tests_ = List.map inputs ~f:(fun (name, input, checks) -> name >:: test_one input checks) in
|
||||
"MaximumSharing_tests" >::: tests_
|
Loading…
Reference in new issue