Maximum sharing: reuse sharing and detect cycles

Summary:
The previous version had a potentially exponential behavior on values with already lots of sharing.
This is fixed here at the price of a multiplicative constant factor (cost of `Hashtbl.hash`).
It also prepares for the handling of cycles.

Reviewed By: ngorogiannis

Differential Revision: D16016906

fbshipit-source-id: 611287917
master
Mehdi Bouaziz 6 years ago committed by Facebook Github Bot
parent 09efe4f2c1
commit 39c7ab86e1

@ -7,6 +7,8 @@
open! IStd
exception MaximumSharingCyclicValue
exception MaximumSharingLazyValue
module Hashing : sig
@ -16,7 +18,8 @@ module Hashing : sig
val of_int : int -> hash_value
val of_no_scan_block : 'a -> hash_value
val shallow : 'a -> hash_value
(** A deterministic hash function that visits O(1) objects, to ensure termination on cyclic values. *)
val alloc_of_block : tag:int -> size:int -> state
@ -30,7 +33,13 @@ end = struct
let of_int = Fn.id
let of_no_scan_block = Caml.Hashtbl.hash
let shallow =
(*
[hash x] is defined as [seeded_hash_param 10 100 0 x].
Here we don't care about the specific numbers as long as the function is deterministic.
*)
Caml.Hashtbl.hash
let alloc_of_block ~tag ~size =
let state = Hash.alloc () in
@ -51,77 +60,134 @@ module Sharer : sig
val normalize_value : t -> 'a -> 'a
end = struct
module HashedNormalizedObj = struct
type t = Hashing.hash_value * Obj.t
let hashed_obj eq =
( module struct
type t = Hashing.hash_value * Obj.t
let hash ((h, _) : t) = (h :> int)
let equal ((h1, o1) : t) ((h2, o2) : t) = Int.equal (h1 :> int) (h2 :> int) && eq o1 o2
end
: Caml.Hashtbl.HashedType
with type t = Hashing.hash_value * Obj.t )
module HashedNoscanBlock = (val hashed_obj Polymorphic_compare.equal)
let equal ((h1, o1) : t) ((h2, o2) : t) =
Int.equal (h1 :> int) (h2 :> int)
&&
if Obj.tag o1 >= Obj.no_scan_tag then Polymorphic_compare.equal o1 o2
else PhysEqual.shallow_equal o1 o2
module PhysEqualedHashedScannableBlock = (val hashed_obj phys_equal)
module HashedNormalizedScannableBlock = (val hashed_obj PhysEqual.shallow_equal)
let hash ((h, _) : t) = (h :> int)
end
module HNoscan = Caml.Hashtbl.Make (HashedNoscanBlock)
module HPhysEq = Caml.Hashtbl.Make (PhysEqualedHashedScannableBlock)
module HNorm = Caml.Hashtbl.Make (HashedNormalizedScannableBlock)
module H = Caml.Hashtbl.Make (HashedNormalizedObj)
type visited = Visiting | Normalized of HashedNormalizedScannableBlock.t
type t =
{ inplace: bool
(** Uses [Obj.set_field] on possibly immutable values, hence should be avoided when used with flambda *)
; hash_normalized: HashedNormalizedObj.t H.t
; noscan_blocks: HashedNoscanBlock.t HNoscan.t
; visited_blocks: visited HPhysEq.t
; hash_normalized: HashedNormalizedScannableBlock.t HNorm.t
; fail_on_forward: bool
; fail_on_nonstring: bool
; fail_on_objects: bool }
let create () =
{ inplace= false
; hash_normalized= H.create 1
; noscan_blocks= HNoscan.create 1
; visited_blocks= HPhysEq.create 1
; hash_normalized= HNorm.create 1
; (* these are just for safety because the code hasn't been tested on them, it should work fine though *)
fail_on_forward= true
; fail_on_nonstring= true
; fail_on_objects= true }
let normalize_block sharer hash block =
let hash_block = (hash, block) in
match H.find_opt sharer.hash_normalized hash_block with
| Some hash_normalized ->
hash_normalized
| None ->
H.add sharer.hash_normalized hash_block hash_block ;
hash_block
let hash_and_normalize_int o = (Hashing.of_int (Obj.obj o : int), o)
(*
TODO: currently the function explores the graph without sharing. It is possible to build values
on which this will behave exponentially. This could be avoided by keeping a hashtbl of visited
values. But it would be much more efficient to write it in C to be able to use the GC flags to
mark visited values.
TODO: be much more efficient and write it in C to be able to use the GC flags to
mark visited values.
*)
let rec hash_and_normalize_obj sharer o =
if Obj.is_int o then hash_and_normalize_int o else hash_and_normalize_block sharer o
and hash_and_normalize_block sharer block =
let shallow_hash = Hashing.shallow block in
let shallow_hash_block = (shallow_hash, block) in
let tag = Obj.tag block in
if tag >= Obj.no_scan_tag then (
(*
No-scan blocks (strings, int64, closures, weird stuff) are treated separately.
They are hashed and compared using the Stdlib polymorphic functions.
*)
assert ((not sharer.fail_on_nonstring) || Int.equal tag Obj.string_tag) ;
hash_and_normalize_no_scan_block sharer block )
else if Int.equal tag Obj.forward_tag then (
assert (not sharer.fail_on_forward) ;
hash_and_normalize_obj sharer (Obj.field block 0) )
else if Int.equal tag Obj.lazy_tag then raise MaximumSharingLazyValue
else (
assert ((not sharer.fail_on_objects) || not (Int.equal tag Obj.object_tag)) ;
let size = Obj.size block in
hash_and_normalize_block_fields sharer block block size 0 (Hashing.alloc_of_block ~tag ~size) )
match HNoscan.find_opt sharer.noscan_blocks shallow_hash_block with
| Some hash_normalized ->
hash_normalized
| None ->
HNoscan.add sharer.noscan_blocks shallow_hash_block shallow_hash_block ;
shallow_hash_block )
else if Int.equal tag Obj.lazy_tag then
(*
For now MaximumSharing is used before marshalling.
It makes little sense to marshal lazy values.
Because lazy blocks are normal scannable blocks, this special case could be safely removed.
*)
raise MaximumSharingLazyValue
else
(*
This is where we could win by mutating the value directly.
Instead we need to use a hashtbl using a shallow hash (for termination), which adds a
multiplicative factor to the running time.
*)
match HPhysEq.find_opt sharer.visited_blocks shallow_hash_block with
| Some (Normalized hash_normalized) ->
(* The block has already been visited, we can reuse the result. *)
hash_normalized
| Some Visiting ->
(*
The block is being visited, which means we have a cycle.
*)
raise MaximumSharingCyclicValue
| None ->
HPhysEq.add sharer.visited_blocks shallow_hash_block Visiting ;
let hash_normalized =
if Int.equal tag Obj.forward_tag then (
assert (not sharer.fail_on_forward) ;
(*
Forward_tag is an intermediate block resulting from the evaluating of a lazy.
As an optimization, let's replace it directly with the normalization of the result
as if this intermediate block didn't exist.
This remains untested for now (hence the assertion above).
Not obvious to test as optimizations or the GC can already do the substitution.
*)
hash_and_normalize_obj sharer (Obj.field block 0) )
else (
(* For regular blocks, normalize each field then use a shallow comparison. *)
assert ((not sharer.fail_on_objects) || not (Int.equal tag Obj.object_tag)) ;
let hash_shallow_normalized =
let size = Obj.size block in
hash_and_normalize_block_fields sharer block block size 0
(Hashing.alloc_of_block ~tag ~size)
in
match HNorm.find_opt sharer.hash_normalized hash_shallow_normalized with
| Some hash_normalized ->
hash_normalized
| None ->
HNorm.add sharer.hash_normalized hash_shallow_normalized hash_shallow_normalized ;
hash_shallow_normalized )
in
HPhysEq.replace sharer.visited_blocks shallow_hash_block (Normalized hash_normalized) ;
hash_normalized
and hash_and_normalize_block_fields sharer original_block new_block size field_i hash_state =
if field_i >= size then normalize_block sharer (Hashing.get_hash_value hash_state) new_block
if field_i >= size then (Hashing.get_hash_value hash_state, new_block)
else
let field_v = Obj.field original_block field_i in
let field_hash, field_v' = hash_and_normalize_obj sharer field_v in
@ -142,10 +208,6 @@ end = struct
(field_i + 1) hash_state
and hash_and_normalize_no_scan_block sharer block =
normalize_block sharer (Hashing.of_no_scan_block block) block
(**
Returns a value structurally equal but with potentially more sharing.
Potentially unsafe if used on mutable values that are modified later.

@ -8,8 +8,8 @@
open! IStd
(**
Current implementation will stack overflow on deep (TODO: a tailrec version)
or circular values (much harder to detect sharing, also not needed for now).
Current implementation will stack overflow on deep values (TODO: a tailrec version)
and throw on circular values.
*)
module Sharer : sig

@ -11,11 +11,13 @@ open OUnit2
let inputs =
let a = `A 'a' in
let b = Array.create ~len:10_000 a in
let c = Array.create ~len:100 b in
[ ("unit", Obj.repr ())
; ("same representation", Obj.repr ([42], [|42; 0|]))
; ("10K times the same element", Obj.repr b)
; ("100 times 10K times the same element", Obj.repr c) ]
let c = Array.create ~len:1_000 b in
let d = Array.create ~len:1_000 c in
[ ("unit", Obj.repr (), `PhysEqual)
; ("same representation", Obj.repr ([42], [|42; 0|]), `Marshal_MustBeBetter)
; ("10K times the same element", Obj.repr b, `PhysEqual)
; ("1K times 10K times the same element", Obj.repr c, `PhysEqual)
; ("1K times 1K times 10K times the same element", Obj.repr d, `PhysEqual) ]
let tests =
@ -23,10 +25,17 @@ let tests =
let sharer = MaximumSharing.Sharer.create () in
MaximumSharing.Sharer.normalize_value sharer input
in
let test_one input _ =
let test_one input checks _ =
(* Save this now, in case `MaximumSharing` mutates the [input], even though it shouldn't *)
let serialized_input_with_sharing = Marshal.to_string input [] in
let serialized_input_no_sharing = Marshal.to_string input [Marshal.No_sharing] in
let serialized_input_no_sharing =
match checks with
| `PhysEqual ->
"UNUSED"
| `Marshal_MustBeBetter ->
(* OOMs for big or cyclic values *)
Marshal.to_string input [Marshal.No_sharing]
in
let reachable_words_input = Obj.reachable_words input in
let normalized = normalize input in
(*
@ -46,15 +55,25 @@ let tests =
assert_bool "less reachable words" (reachable_words_normalized <= reachable_words_input) ;
let eq =
(* Cannot use [assert_equal] because it doesn't shortcut physical equalities *)
Polymorphic_compare.equal input normalized
match checks with
| `PhysEqual ->
phys_equal input normalized
| `Marshal_MustBeBetter ->
Polymorphic_compare.equal input normalized
in
assert_bool "equal" eq ;
(*
match checks with
| `PhysEqual ->
()
| `Marshal_MustBeBetter ->
assert_bool "strictly less reachable words"
(reachable_words_normalized < reachable_words_input) ;
(*
In case structural equality and marshalling have slightly different semantics,
let's also make sure the serialized versions are indistinguishable
*)
let serialized_normalized_no_sharing = Marshal.to_string normalized [Marshal.No_sharing] in
assert_equal serialized_input_no_sharing serialized_normalized_no_sharing
let serialized_normalized_no_sharing = Marshal.to_string normalized [Marshal.No_sharing] in
assert_equal serialized_input_no_sharing serialized_normalized_no_sharing
in
let tests_ = List.map inputs ~f:(fun (name, input) -> name >:: test_one input) in
let tests_ = List.map inputs ~f:(fun (name, input, checks) -> name >:: test_one input checks) in
"MaximumSharing_tests" >::: tests_

Loading…
Cancel
Save