You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

424 lines
18 KiB

import getpass
import inspect
import os
import re
import sys
import tempfile
from os.path import abspath, dirname
from typing import Any, Callable, Dict, Optional, Set, Type, TYPE_CHECKING, Union
import torch
# to configure logging for dynamo, aot, and inductor
# use the following API in the torch._logging module
# torch._logging.set_logs(dynamo=<level>, aot=<level>, inductor<level>)
# or use the environment variable TORCH_LOGS="dynamo,aot,inductor" (use a prefix + to indicate higher verbosity)
# see this design doc for more detailed info
# Design doc: https://docs.google.com/document/d/1ZRfTWKa8eaPq1AxaiHrq4ASTPouzzlPiuquSBEJYwS8/edit#
# the name of a file to write the logs to
# [@compile_ignored: debug]
log_file_name: Optional[str] = None
# [@compile_ignored: debug] Verbose will print full stack traces on warnings and errors
verbose = os.environ.get("TORCHDYNAMO_VERBOSE", "0") == "1"
# [@compile_ignored: runtime_behaviour] verify the correctness of optimized backend
verify_correctness = False
# need this many ops to create an FX graph
minimum_call_count = 1
# turn on/off DCE pass
dead_code_elimination = True
# disable (for a function) when cache reaches this size
# controls the maximum number of cache entries with a guard on same ID_MATCH'd
# object. It also controls the maximum size of cache entries if they don't have
# any ID_MATCH'd guards.
# [@compile_ignored: runtime_behaviour]
cache_size_limit = 8
# [@compile_ignored: runtime_behaviour] controls the maximum number of entries for a code object.
accumulated_cache_size_limit = 64
# whether or not to specialize on int inputs. This only has an effect with
# dynamic_shapes; when dynamic_shapes is False, we ALWAYS specialize on int
# inputs. Note that assume_static_by_default will also cause ints to get
# specialized, so this is mostly useful for export, where we want inputs
# to be dynamic, but accesses to ints should NOT get promoted into inputs.
specialize_int = False
# legacy config, does nothing now!
dynamic_shapes = True
use_lazy_graph_module = (
os.environ.get("TORCH_COMPILE_USE_LAZY_GRAPH_MODULE", "1") == "1"
)
# This is a temporarily flag, which changes the behavior of dynamic_shapes=True.
# When assume_static_by_default is True, we only allocate symbols for shapes marked dynamic via mark_dynamic.
# NOTE - this flag can be removed once we can run dynamic_shapes=False w/ the mark_dynamic API
# see [Note - on the state of mark_dynamic]
assume_static_by_default = True
# This flag changes how dynamic_shapes=True works, and is meant to be used in conjunction
# with assume_static_by_default=True.
# With this flag enabled, we always compile a frame as fully static for the first time, and, if we fail
# any guards due to wobbles in shape, we recompile with *all* the wobbled shapes as being marked dynamic.
automatic_dynamic_shapes = True
# This flag changes how the shapes of parameters are treated.
# If this flag is set to True, then the shapes of torch.nn.Parameter as well as of torch.Tensor are attempted to be dynamic
# If this flag is set to False, then the shapes of torch.nn.Parameter are assumed to be static,
# while the shapes of torch.Tensor are assumed to be dynamic.
force_parameter_static_shapes = True
# This flag ensures that the shapes of a nn module are always assumed to be static
# If the flag is set to True, then the shapes of a nn.module are assumed to be static
# If the flag is set to False, then the shapes of a nn.module can be dynamic
force_nn_module_property_static_shapes = True
# Typically, if you mark_dynamic a dimension, we will error if the dimension
# actually ended up getting specialized. This knob changes the behavior so
# that we don't error at all. This is helpful for our CI where I'm using a
# heuristic to mark batch dimensions as dynamic and the heuristic may get it
# wrong.
allow_ignore_mark_dynamic = False
# Set this to False to assume nn.Modules() contents are immutable (similar assumption as freezing)
guard_nn_modules = False
# Uses CPython internal dictionary tags to detect mutation. There is some
# overlap between guard_nn_modules_using_dict_tags and guard_nn_modules flag.
# guard_nn_modules unspecializes the nn module instance and adds guard for each
# relevant member of the nn modules. On the other hand,
# guard_nn_modules_using_dict_tags specializes on each nn module instance but
# uses low overhead dict version matching to detect mutations, obviating the
# need to guard on members of the nn modules. With
# guard_nn_modules_using_dict_tags, the guard_nn_modules is not really required
# but kept around for debugging and discussing unspecializing nn module
# variables.
# TODO(janimesh, voz): Remove both of these flags (or atleast guard_nn_modules)
# once we have reached stability for the guard_nn_modules_using_dict_tags.
guard_nn_modules_using_dict_tags = True
# This feature doesn't really work. We offer this flag for experimental
# purposes / if you want to help us build out support.
#
# torchdynamo has very limited support for tensor subclasses that implement
# __torch_function__. Our current support is limited to tensor subclasses
# that DO NOT store metadata on the tensor (in general, dynamo does not
# support Python code that stores extra attributes on tensors at present).
# If your tensor subclass purely changes function call behavior via
# __torch_function__, you can allow torchdynamo to trace into it by
# adding it to traceable_tensor_subclasses. We don't do any safety checks,
# so it is up to you to ensure that your subclass is well behaved. See also
# https://github.com/pytorch/torchdynamo/issues/1948
#
# We do NOT currently support __torch_dispatch__. The implementation is
# currently buggy, the main show stopper for nontrivial use is
# https://github.com/pytorch/torchdynamo/issues/1952
traceable_tensor_subclasses: Set[Type[Any]] = set()
# Suppress errors in torch._dynamo.optimize, instead forcing a fallback to eager.
# This is a good way to get your model to work one way or another, but you may
# lose optimization opportunities this way. Devs, if your benchmark model is failing
# this way, you should figure out why instead of suppressing it.
suppress_errors = bool(os.environ.get("TORCHDYNAMO_SUPPRESS_ERRORS", False))
# Record and write an execution record of the current frame to a file
# if an exception is encountered
# @compile_ignored[debug]
replay_record_enabled = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1"
# Rewrite assert statement in python with torch._assert
rewrite_assert_with_torch_assert = True
# Disable dynamo
disable = os.environ.get("TORCH_COMPILE_DISABLE", False)
# [@compile_ignored: runtime_behaviour] Get a cprofile trace of Dynamo
cprofile = os.environ.get("TORCH_COMPILE_CPROFILE", False)
# legacy config, does nothing now!
skipfiles_inline_module_allowlist: Dict[Any, Any] = {}
# If a string representing a PyTorch module is in this ignorelist,
# the `allowed_functions.is_allowed` function will not consider it
# when creating a list of PyTorch functions that will appear in
# FX IR.
allowed_functions_module_string_ignorelist = {
"torch.distributions",
"torch.testing",
"torch._refs",
"torch._prims",
"torch._decomp",
}
# Debug Flag to try minifier at different stages. Possible values are {None, "aot", "dynamo"}
# None - Minifier is switched off
# dynamo - Runs minifier on the TorchDynamo produced graphs, if compilation fails
# aot - Runs minifier on the Aot Autograd produced graphs, if compilation fails
# [@compile_ignored: debug]
repro_after = os.environ.get("TORCHDYNAMO_REPRO_AFTER", None)
# Compiler compilation debug info
# 1: Dumps the original graph out to repro.py if compilation fails
# 2: Dumps a minifier_launcher.py if compilation fails.
# 3: Always dumps a minifier_launcher.py. Good for segfaults.
# 4: Dumps a minifier_launcher.py if the accuracy fails.
# [@compile_ignored: debug]
repro_level = int(os.environ.get("TORCHDYNAMO_REPRO_LEVEL", 2))
# By default, we try to detect accuracy failure by running both forward
# and backward of a torchdynamo produced graph (if you are using repro_after
# 'dynamo'). This setting forces us to only test the forward graph and
# not the backward graph. This can be helpful if you're trying to debug
# an inference only problem, but the minifier seems to be choking on the
# backwards step
# TODO: Detect this situation automatically so the user doesn't need
# to manually configure this
# [@compile_ignored: debug]
repro_forward_only = os.environ.get("TORCHDYNAMO_REPRO_FORWARD_ONLY") == "1"
# The tolerance we should use when testing if a compiled graph
# has diverged so that we should treat it as an accuracy failure
# [@compile_ignored: debug]
repro_tolerance = 1e-3
# If True, when testing if two models are the same, we will test them against
# a third fp64 reference and only report a problem if the RMSE relative to the
# fp64 is greater. However, this will use more memory; you may disable this
# if memory usage is too high.
# [@compile_ignored: runtime_behaviour]
same_two_models_use_fp64 = True
# Not all backends support scalars. Some calls on torch.Tensor (like .item()) return a scalar type.
# When this flag is set to False, we introduce a graph break instead of capturing.
# This requires dynamic_shapes to be True.
capture_scalar_outputs = False
# Not all backends support operators that have dynamic output shape (e.g.,
# nonzero, unique). When this flag is set to False, we introduce a graph
# break instead of capturing. This requires dynamic_shapes to be True.
# If you set this to True, you probably also want capture_scalar_outputs
# (these are separated for historical reasons).
capture_dynamic_output_shape_ops = False
# By default, dynamo will treat all ints as backed SymInts, which means (1) it
# will wait to see the int change over multiple runs before generalizing and
# (2) it will still always 0/1 specialize an int. When true, this knob
# forces dynamo to treat _length_per_key and _offset_per_key on
# KeyedJaggedTensor from torchrec as size-like unbacked SymInts, so that
# they (1) generalize immediately and (2) unsoundly never compare equal to
# 0/1. This is not on by default as AOTAutograd/Inductor cannot currently
# compile this code; however, this can be useful for export.
force_unspec_int_unbacked_size_like_on_torchrec_kjt = False
# Should almost always be true in prod. This relaxes the requirement that cond's true_fn and
# false_fn produces code with identical guards.
enforce_cond_guards_match = True
# Specify how to optimize a compiiled DDP module. The flag accepts a bollean
# value or a string. There are 4 modes.
# 1. "ddp_optimizer" (or True): with "ddp_ptimizer", Dynamo will automatically
# split model graph into pieces to match DDP bucket sizes to allow DDP
# comm/compute overlap.
# 2. "python_reducer" (experimental): this optimization requires the usage
# of compiled_autograd. With "python_reducer", DDP will disable the C++ reducer
# and use the Python reducer to allow compiled_autograd to trace the
# communication and allow comm/compute overlap without graph-breaks.
# 3. "python_reducer_without_compiled_forward" (experimental): this mode is
# similar to "python_reducer". One should only use this optimization mode
# when compiled_autograd is used but the DDP module is not compiled.
# 4. "no_optimization" (or False): Dynamo won't split the model graph, nor
# will Python reducer be used. With this mode, there will be no graph-breaks
# and the original DDP C++ reducer will be used. There will no comm/compute
# overlap. This mode CANNOT be used with compiled_autograd.
# Note that to avoid breaking the existing usage, mode 1 and mode 4 can be
# specified with a boolean value. True is using ddp_optimizer and False is
# no optimization.
optimize_ddp: Union[bool, str] = True
_ddp_optimization_mode = [
"ddp_optimizer",
"python_reducer", # experimental mode
"python_reducer_without_compiled_forward", # experimental mode
"no_optimization",
]
def _get_optimize_ddp_mode():
m = sys.modules[__name__]
if isinstance(m.optimize_ddp, bool):
if m.optimize_ddp:
mode = "ddp_optimizer"
else:
mode = "no_optimization"
elif isinstance(m.optimize_ddp, str):
mode = m.optimize_ddp
else:
raise ValueError(f"Invalid type, {type(optimize_ddp)=}")
assert mode in m._ddp_optimization_mode, f"Invalid mode {mode=}"
return mode
# If True, delays DDPOptimizer submodule compilation to 1st run of the model,
# so that real tensor strides are used in all submodules
# (instead of using FakeTensor strides which can differ from real tensor strides and causes error in some cases).
# This feature is not hardened yet and it's known to cause issues to some models, so False by default.
optimize_ddp_lazy_compile = False
# Whether to skip guarding on FSDP-managed modules
skip_fsdp_guards = True
# Make dynamo skip guarding on hooks on nn modules
# Note: unsafe: if your model actually has hooks and you remove them, or doesn't and you add them,
# dynamo will not notice and will execute whichever version you first compiled.
skip_nnmodule_hook_guards = True
# If True, raises exception if TorchDynamo is called with a context manager
raise_on_ctx_manager_usage = True
# If True, raise when aot autograd is unsafe to use
raise_on_unsafe_aot_autograd = False
# If true, error if you torch.jit.trace over a dynamo-optimized function.
# If false, silently suppress dynamo
error_on_nested_jit_trace = True
# If true, error with a better message if we symbolically trace over a
# dynamo-optimized function. If false, silently suppress dynamo.
error_on_nested_fx_trace = True
# Disables graph breaking on rnn. YMMV with backends.
allow_rnn = False
# If true, error if we try to compile a function that has
# been seen before.
# [@compile_ignored: runtime_behaviour]
error_on_recompile = False
# [@compile_ignored: debug] Whether to report any guard failures (deprecated: does not do anything)
report_guard_failures = True
# [@compile_ignored: debug] root folder of the project
base_dir = dirname(dirname(dirname(abspath(__file__))))
# Trace through NumPy or graphbreak
trace_numpy = True
# Trace through torch.distributed code
trace_distributed = False
# Default NumPy dtypes when tracing with torch.compile
# We default to 64bits. For efficiency, one may want to change these to float32
numpy_default_float = "float64"
numpy_default_complex = "complex128"
numpy_default_int = "int64"
# use numpy's PRNG if True, pytorch otherwise
use_numpy_random_stream = False
def is_fbcode():
return not hasattr(torch.version, "git_version")
def default_debug_dir_root():
# [@compile_ignored: debug]
DEBUG_DIR_VAR_NAME = "TORCH_COMPILE_DEBUG_DIR"
if DEBUG_DIR_VAR_NAME in os.environ:
return os.path.join(os.environ[DEBUG_DIR_VAR_NAME], "torch_compile_debug")
elif is_fbcode():
return os.path.join(
tempfile.gettempdir(), getpass.getuser(), "torch_compile_debug"
)
else:
return os.path.join(os.getcwd(), "torch_compile_debug")
# [@compile_ignored: debug]
debug_dir_root = default_debug_dir_root()
# [@compile_ignored: debug]
_save_config_ignore = {
"repro_after",
"repro_level",
# workaround: "cannot pickle PyCapsule"
"constant_functions",
# workaround: "cannot pickle module"
"skipfiles_inline_module_allowlist",
}
# for backend="cudagraphs", mutations on input be sent to the cudagraph backend
# or replayed in aot_autograd epilogue. default is False because mutation on inputs
# can prevent cudagraphing.
cudagraph_backend_keep_input_mutation = False
# When True, only ops that have the torch.Tag.pt2_compliant tag
# will be allowed into the graph; all other ops will be disallowed
# and will fall back to eager-mode PyTorch. Useful to ensure
# correctness of custom ops.
only_allow_pt2_compliant_ops = False
capture_autograd_function = True
# enable/disable dynamo tracing for `torch.func` transforms
capture_func_transforms = False
# enable/disable user-defined triton kernel optimizations
optimize_user_defined_triton_kernels = True
# If to log Dynamo compilation metrics into log files (for OSS) and Scuba tables (for fbcode).
log_compilation_metrics = True
# A set of logging functions which will be reordered to the end of graph breaks,
# allowing dynamo to construct larget graph. Note that there are some
# limitations to this, such as how it does not correctly print objects that were
# mutated after the print statement.
reorderable_logging_functions: Set[Callable[[Any], None]] = set()
# simulates what would happen if we didn't have support for BUILD_SET opcode,
# used for testing
inject_BUILD_SET_unimplemented_TESTING_ONLY = False
_autograd_backward_strict_mode_banned_ops = [
"stride",
"requires_grad",
"storage_offset",
"layout",
"data",
]
_autograd_backward_strict_mode_banned_ops.extend(
[name for name, _ in inspect.getmembers(torch.Tensor) if re.match(r"^is_.*", name)]
)
# Enables caching of dispatches to fake tensors.
fake_tensor_cache_enabled = (
os.environ.get("TORCH_FAKE_TENSOR_DISPATCH_CACHE", "1") == "1"
)
# Enables cross checking between the fake tensor cache and dispatch.
fake_tensor_cache_crosscheck_enabled = (
os.environ.get("TORCH_FAKE_TENSOR_DISPATCH_CACHE_CROSSCHECK", "0") == "1"
)
# support `context_fn` in torch.utils.checkpoint.checkpoint API under torch.compile().
# WARNING: this is an experimental flag and is subject to change.
_experimental_support_context_fn_in_torch_utils_checkpoint = False
if TYPE_CHECKING:
from torch.utils._config_typing import * # noqa: F401, F403
def _make_closure_patcher(**changes):
...
from torch.utils._config_module import install_config_module
install_config_module(sys.modules[__name__])