r"""This package adds support for device memory management implemented in CUDA."""
|
||
|
|
||
|
import collections
|
||
|
import contextlib
|
||
|
import ctypes
|
||
|
import pickle
|
||
|
import sys
|
||
|
import warnings
|
||
|
from inspect import signature
|
||
|
|
||
|
from typing import Any, Dict, Optional, Tuple, Union
|
||
|
|
||
|
import torch
|
||
|
from torch import _C
|
||
|
|
||
|
from torch.types import Device
|
||
|
from .._utils import _dummy_type
|
||
|
from . import _get_device_index, _get_nvml_device_index, _lazy_init, is_initialized
|
||
|
|
||
|
from ._memory_viz import memory as _memory, segments as _segments
|
||
|
|
||
|
__all__ = [
    "caching_allocator_alloc",
    "caching_allocator_delete",
    "set_per_process_memory_fraction",
    "empty_cache",
    "memory_stats",
    "memory_stats_as_nested_dict",
    "reset_accumulated_memory_stats",
    "reset_peak_memory_stats",
    "reset_max_memory_allocated",
    "reset_max_memory_cached",
    "memory_allocated",
    "max_memory_allocated",
    "memory_reserved",
    "max_memory_reserved",
    "memory_cached",
    "max_memory_cached",
    "memory_snapshot",
    "memory_summary",
    "list_gpu_processes",
    "mem_get_info",
    "get_allocator_backend",
    "CUDAPluggableAllocator",
    "change_current_allocator",
]


if not hasattr(torch._C, "_cuda_CUDAAllocator"):
    # Define dummy base classes
    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")


def _host_allocator():
    _lazy_init()
    return torch._C._cuda_cudaHostAllocator()


@contextlib.contextmanager
def _free_mutex():
    torch._C._cuda_lock_mutex()
    try:
        yield
    finally:
        torch._C._cuda_unlock_mutex()


def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
    r"""Perform a memory allocation using the CUDA memory allocator.

    Memory is allocated for a given device and a stream; this
    function is intended to be used for interoperability with other
    frameworks. Allocated memory is released through
    :func:`~torch.cuda.caching_allocator_delete`.

    Args:
        size (int): number of bytes to be allocated.
        device (torch.device or int, optional): selected device. If it is
            ``None`` the default CUDA device is used.
        stream (torch.cuda.Stream or int, optional): selected stream. If it is
            ``None`` then the default stream for the selected device is used.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
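
    Example (a minimal sketch; assumes a CUDA-capable device and that the
    returned pointer is only used with the allocating stream)::

        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # raw 1 KiB buffer
        >>> torch.cuda.caching_allocator_delete(ptr)  # caller must free it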
"""
    if device is None:
        device = torch.cuda.current_device()
    device = _get_device_index(device)
    if stream is None:
        stream = torch.cuda.current_stream(device)
    if isinstance(stream, torch.cuda.streams.Stream):
        stream = stream.cuda_stream
    if not isinstance(stream, int):
        raise TypeError(
            "Invalid type for stream argument, must be "
            "`torch.cuda.Stream` or `int` representing a pointer "
            "to an existing stream"
        )
    with torch.cuda.device(device):
        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)


def caching_allocator_delete(mem_ptr):
    r"""Delete memory allocated using the CUDA memory allocator.

    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
    is freed here. The associated device and stream are tracked inside
    the allocator.

    Args:
        mem_ptr (int): memory address to be freed by the allocator.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
"""
    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)


def set_per_process_memory_fraction(
    fraction, device: Union[Device, int] = None
) -> None:
    r"""Set memory fraction for a process.

    The fraction is used to limit how much memory the caching allocator may
    allocate on a CUDA device. The allowed value equals the total visible
    memory multiplied by the fraction. If a process tries to allocate more
    than the allowed value, the allocator raises an out-of-memory error.

    Args:
        fraction (float): Range: 0~1. Allowed memory equals total_memory * fraction.
        device (torch.device or int, optional): selected device. If it is
            ``None`` the default CUDA device is used.

    .. note::
        In general, the total available free memory is less than the total capacity.
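
    Example (a minimal sketch; assumes at least one CUDA device)::

        >>> # Cap this process at roughly half of device 0's total memory.
        >>> torch.cuda.set_per_process_memory_fraction(0.5, device=0)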
"""
    _lazy_init()
    if device is None:
        device = torch.cuda.current_device()
    device = _get_device_index(device)
    if not isinstance(fraction, float):
        raise TypeError("Invalid type for fraction argument, must be `float`")
    if fraction < 0 or fraction > 1:
        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")

    torch._C._cuda_setMemoryFraction(fraction, device)


def empty_cache() -> None:
    r"""Release all unoccupied cached memory currently held by the caching
    allocator so that it can be used by other GPU applications and is
    visible in `nvidia-smi`.

    .. note::
        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
        memory available for PyTorch. However, it may help reduce fragmentation
        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
        more details about GPU memory management.
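
    Example (a minimal sketch; assumes a CUDA device)::

        >>> x = torch.empty(1024, device="cuda")
        >>> del x  # the freed block stays in the allocator's cache
        >>> torch.cuda.empty_cache()  # hand the cached block back to CUDA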
"""
    if is_initialized():
        torch._C._cuda_emptyCache()


def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]:
    r"""Return a dictionary of CUDA memory allocator statistics for a given device.

    The return value of this function is a dictionary of statistics, each of
    which is a non-negative integer.

    Core statistics:

    - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of allocation requests received by the memory allocator.
    - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of allocated memory.
    - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of reserved segments from ``cudaMalloc()``.
    - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of reserved memory.
    - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of active memory blocks.
    - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of active memory.
    - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      number of inactive, non-releasable memory blocks.
    - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of inactive, non-releasable memory.

    For these core statistics, values are broken down as follows.

    Pool type:

    - ``all``: combined statistics across all memory pools.
    - ``large_pool``: statistics for the large allocation pool
      (as of October 2019, for size >= 1MB allocations).
    - ``small_pool``: statistics for the small allocation pool
      (as of October 2019, for size < 1MB allocations).

    Metric type:

    - ``current``: current value of this metric.
    - ``peak``: maximum value of this metric.
    - ``allocated``: historical total increase in this metric.
    - ``freed``: historical total decrease in this metric.

    In addition to the core statistics, we also provide some simple event
    counters:

    - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that
      result in a cache flush and retry.
    - ``"num_ooms"``: number of out-of-memory errors thrown.

    The caching allocator can be configured via ENV to not split blocks larger than a
    defined size (see Memory Management section of the CUDA Semantics documentation).
    This helps avoid memory fragmentation but may have a performance
    penalty. Additional outputs to assist with tuning and evaluating impact:

    - ``"max_split_size"``: blocks above this size will not be split.
    - ``"oversize_allocations.{current,peak,allocated,freed}"``:
      number of over-size allocation requests received by the memory allocator.
    - ``"oversize_segments.{current,peak,allocated,freed}"``:
      number of over-size reserved segments from ``cudaMalloc()``.

    The caching allocator can be configured via ENV to round memory allocations in order
    to reduce fragmentation. Sometimes the overhead from rounding can be higher than
    the fragmentation it helps reduce. The following stat can be used to check if
    rounding adds too much overhead:

    - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      memory requested by client code; compare this with allocated_bytes to check if
      allocation rounding adds too much overhead.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistics for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.

    .. note::
        With :ref:`backend:cudaMallocAsync<cuda-memory-envvars>`, some stats are not
        meaningful, and are always reported as zero.
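
    Example (a minimal sketch; assumes a CUDA build of PyTorch with a device
    attached)::

        >>> stats = torch.cuda.memory_stats()
        >>> current = stats["allocated_bytes.all.current"]
        >>> peak = stats["allocated_bytes.all.peak"]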
"""
    result = []

    def _recurse_add_to_result(prefix, obj):
        if isinstance(obj, dict):
            if len(prefix) > 0:
                prefix += "."
            for k, v in obj.items():
                _recurse_add_to_result(prefix + k, v)
        else:
            result.append((prefix, obj))

    stats = memory_stats_as_nested_dict(device=device)
    _recurse_add_to_result("", stats)
    result.sort()

    return collections.OrderedDict(result)


def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> Dict[str, Any]:
    r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary."""
    if not is_initialized():
        return {}
    device = _get_device_index(device, optional=True)
    return torch._C._cuda_memoryStats(device)


def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None:
    r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator.

    See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to
    the `"allocated"` and `"freed"` keys in each individual stat dict, as well as
    `"num_alloc_retries"` and `"num_ooms"`.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    device = _get_device_index(device, optional=True)
    return torch._C._cuda_resetAccumulatedMemoryStats(device)


def reset_peak_memory_stats(device: Union[Device, int] = None) -> None:
    r"""Reset the "peak" stats tracked by the CUDA memory allocator.

    See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the
    `"peak"` key in each individual stat dict.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    device = _get_device_index(device, optional=True)
    return torch._C._cuda_resetPeakMemoryStats(device)


def reset_max_memory_allocated(device: Union[Device, int] = None) -> None:
    r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device.

    See :func:`~torch.cuda.max_memory_allocated` for details.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. warning::
        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
        /all/ peak memory stats.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    warnings.warn(
        "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, "
        "which resets /all/ peak memory stats.",
        FutureWarning,
    )
    return reset_peak_memory_stats(device=device)


def reset_max_memory_cached(device: Union[Device, int] = None) -> None:
    r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device.

    See :func:`~torch.cuda.max_memory_cached` for details.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. warning::
        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
        /all/ peak memory stats.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    warnings.warn(
        "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, "
        "which resets /all/ peak memory stats.",
        FutureWarning,
    )
    return reset_peak_memory_stats(device=device)


def memory_allocated(device: Union[Device, int] = None) -> int:
    r"""Return the current GPU memory occupied by tensors in bytes for a given device.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        This is likely less than the amount shown in `nvidia-smi` since some
        unused memory can be held by the caching allocator and some context
        needs to be created on GPU. See :ref:`cuda-memory-management` for more
        details about GPU memory management.
    """
    return memory_stats(device=device).get("allocated_bytes.all.current", 0)


def max_memory_allocated(device: Union[Device, int] = None) -> int:
    r"""Return the maximum GPU memory occupied by tensors in bytes for a given device.

    By default, this returns the peak allocated memory since the beginning of
    this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to
    reset the starting point in tracking this metric. For example, these two
    functions can measure the peak allocated memory usage of each iteration in
    a training loop, as sketched below.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
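
    Example (a minimal sketch; ``train_step`` and ``batches`` are placeholder
    names, not part of this module)::

        >>> for batch in batches:  # doctest: +SKIP
        ...     torch.cuda.reset_peak_memory_stats()
        ...     train_step(batch)
        ...     print(torch.cuda.max_memory_allocated())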
"""
    return memory_stats(device=device).get("allocated_bytes.all.peak", 0)


def memory_reserved(device: Union[Device, int] = None) -> int:
    r"""Return the current GPU memory managed by the caching allocator in bytes for a given device.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    return memory_stats(device=device).get("reserved_bytes.all.current", 0)


def max_memory_reserved(device: Union[Device, int] = None) -> int:
    r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device.

    By default, this returns the peak cached memory since the beginning of this
    program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset
    the starting point in tracking this metric. For example, these two functions
    can measure the peak cached memory amount of each iteration in a training
    loop.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    return memory_stats(device=device).get("reserved_bytes.all.peak", 0)


def memory_cached(device: Union[Device, int] = None) -> int:
    r"""Deprecated; see :func:`~torch.cuda.memory_reserved`."""
    warnings.warn(
        "torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved",
        FutureWarning,
    )
    return memory_reserved(device=device)


def max_memory_cached(device: Union[Device, int] = None) -> int:
    r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`."""
    warnings.warn(
        "torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved",
        FutureWarning,
    )
    return max_memory_reserved(device=device)


def memory_snapshot():
    r"""Return a snapshot of the CUDA memory allocator state across all devices.

    Interpreting the output of this function requires familiarity with the
    memory allocator internals.

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
    """
    return torch._C._cuda_memorySnapshot()["segments"]


def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str:
    r"""Return a human-readable printout of the current memory allocator statistics for a given device.

    This can be useful to display periodically during training, or when
    handling out-of-memory exceptions.

    Args:
        device (torch.device or int, optional): selected device. Returns
            printout for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).
        abbreviated (bool, optional): whether to return an abbreviated summary
            (default: False).

    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.
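
    Example (a minimal sketch; assumes a CUDA device)::

        >>> print(torch.cuda.memory_summary(abbreviated=True))  # doctest: +SKIP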
"""
    device = _get_device_index(device, optional=True)
    stats = memory_stats(device=device)

    def _format_size(sz, pref_sz):
        prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"]
        prefix = prefixes[0]
        for new_prefix in prefixes[1:]:
            if pref_sz < 768 * 1024:
                break
            prefix = new_prefix
            sz //= 1024
            pref_sz /= 1024
        return f"{sz:6d} {prefix}"

    def _format_count(cnt, pref_cnt):
        prefixes = [" ", "K", "M"]
        prefix = prefixes[0]
        for new_prefix in prefixes[1:]:
            if pref_cnt < 750 * 1000:
                break
            prefix = new_prefix
            cnt //= 1000
            pref_cnt /= 1000
        return f"{cnt:7d} {prefix} "

    metrics_to_display = [
        ("allocated_bytes", "Allocated memory", _format_size),
        ("active_bytes", "Active memory", _format_size),
        ("requested_bytes", "Requested memory", _format_size),
        ("reserved_bytes", "GPU reserved memory", _format_size),
        ("inactive_split_bytes", "Non-releasable memory", _format_size),
        ("allocation", "Allocations", _format_count),
        ("active", "Active allocs", _format_count),
        ("segment", "GPU reserved segments", _format_count),
        ("inactive_split", "Non-releasable allocs", _format_count),
    ]

    lines = []
    lines.append("=" * 75)
    lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ")
    lines.append("-" * 75)
    lines.append(
        " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} "
    )
    lines.append("=" * 75)
    lines.append(
        " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed "
    )

    for metric_key, metric_name, formatter in metrics_to_display:
        lines.append("-" * 75)
        submetrics = [("all", metric_name)]
        if not abbreviated:
            submetrics.append(("large_pool", " from large pool"))
            submetrics.append(("small_pool", " from small pool"))

        current_prefval, peak_prefval, allocated_prefval, freed_prefval = (
            None,
            None,
            None,
            None,
        )

        for submetric_key, submetric_name in submetrics:
            prefix = metric_key + "." + submetric_key + "."

            current = stats[prefix + "current"]
            peak = stats[prefix + "peak"]
            allocated = stats[prefix + "allocated"]
            freed = stats[prefix + "freed"]

            if current_prefval is None:
                current_prefval = current
                peak_prefval = peak
                allocated_prefval = allocated
                freed_prefval = freed

            lines.append(
                " {:<21} | {} | {} | {} | {} ".format(
                    submetric_name,
                    formatter(current, current_prefval),
                    formatter(peak, peak_prefval),
                    formatter(allocated, allocated_prefval),
                    formatter(freed, freed_prefval),
                ),
            )

    metrics_to_display = [
        ("oversize_allocations", "Oversize allocations", _format_count),
        ("oversize_segments", "Oversize GPU segments", _format_count),
    ]

    for metric_key, metric_name, formatter in metrics_to_display:
        lines.append("-" * 75)

        prefix = metric_key + "."

        current = stats[prefix + "current"]
        peak = stats[prefix + "peak"]
        allocated = stats[prefix + "allocated"]
        freed = stats[prefix + "freed"]

        lines.append(
            " {:<21} | {} | {} | {} | {} ".format(
                metric_name,
                formatter(current, current),
                formatter(peak, peak),
                formatter(allocated, allocated),
                formatter(freed, freed),
            ),
        )

    lines.append("=" * 75)

    fmt_dict = {"_": "", "device": device}
    for k, v in stats.items():
        fmt_dict[k.replace(".", "-")] = v
    return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n"


def list_gpu_processes(device: Union[Device, int] = None) -> str:
    r"""Return a human-readable printout of the running processes and their GPU memory use for a given device.

    This can be useful to display periodically during training, or when
    handling out-of-memory exceptions.

    Args:
        device (torch.device or int, optional): selected device. Returns
            printout for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).
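
    Example (a minimal sketch; the exact output varies by machine)::

        >>> print(torch.cuda.list_gpu_processes())  # doctest: +SKIP
        GPU:0
        no processes are running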
"""
    try:
        import pynvml  # type: ignore[import]
    except ModuleNotFoundError:
        return "pynvml module not found, please install pynvml"
    from pynvml import NVMLError_DriverNotLoaded

    try:
        pynvml.nvmlInit()
    except NVMLError_DriverNotLoaded:
        return "cuda driver can't be loaded, is cuda enabled?"
    device = _get_nvml_device_index(device)
    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
    procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    lines = []
    lines.append(f"GPU:{device}")
    if len(procs) == 0:
        lines.append("no processes are running")
    for p in procs:
        mem = p.usedGpuMemory / (1024 * 1024)
        lines.append(f"process {p.pid:>10d} uses {mem:>12.3f} MB GPU memory")
    return "\n".join(lines)


def mem_get_info(device: Union[Device, int] = None) -> Tuple[int, int]:
    r"""Return the global free and total GPU memory for a given device using ``cudaMemGetInfo``.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        See :ref:`cuda-memory-management` for more
        details about GPU memory management.
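
    Example (a minimal sketch; assumes a CUDA device)::

        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
        >>> free_bytes <= total_bytes
        True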
"""
    if device is None:
        device = torch.cuda.current_device()
    device = _get_device_index(device)
    return torch.cuda.cudart().cudaMemGetInfo(device)


def _record_memory_history_legacy(
    enabled: bool,
    record_context=True,
    trace_alloc_max_entries=1,
    trace_alloc_record_context=False,
    device: Union[Device, int] = None,
    record_context_cpp=False,
):
    _C._cuda_record_memory_history_legacy(
        enabled,
        record_context,
        trace_alloc_max_entries,
        trace_alloc_record_context,
        record_context_cpp,
    )


def _record_memory_history(enabled="all", *args, **kwargs):
    """Enable recording of stack traces associated with memory
    allocations, so you can tell what allocated any piece of memory in
    :func:`torch.cuda.memory._snapshot()`.

    In addition to keeping stack traces with each current allocation and free,
    this will also enable recording of a history of all alloc/free events.

    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
    and the tools in `_memory_viz.py` to visualize snapshots.

    The Python trace collection is fast (2us per trace), so you may consider
    enabling this on production jobs if you anticipate ever having to debug
    memory issues.

    C++ trace collection is also fast (~50ns/frame), which for many typical programs
    works out to ~2us per trace, but can vary depending on stack depth.

    Args:
        enabled (Literal[None, "state", "all"], optional):
            `None`, disable recording memory history.
            `"state"`, keep information for currently allocated memory.
            `"all"`, additionally keep a history of all alloc/free calls.
            Defaults to "all".
        context (Literal[None, "state", "alloc", "all"], optional):
            `None`, do not record any tracebacks.
            `"state"`, record tracebacks for currently allocated memory.
            `"alloc"`, additionally keep tracebacks for alloc calls.
            `"all"`, additionally keep tracebacks for free calls.
            Defaults to "all".
        stacks (Literal["python", "all"], optional):
            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
            `"all"`, additionally include C++ frames
            Defaults to "all".
        max_entries (int, optional): Keep a maximum of `max_entries`
            alloc/free events in the recorded history.
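
    Example (a minimal sketch of a typical debugging session; assumes a CUDA
    device)::

        >>> torch.cuda.memory._record_memory_history()
        >>> # ... run the workload being investigated ...
        >>> torch.cuda.memory._dump_snapshot("snapshot.pickle")  # doctest: +SKIP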
"""
    if isinstance(enabled, bool):
        return _record_memory_history_legacy(enabled, *args, **kwargs)
    else:
        return _record_memory_history_impl(enabled, *args, **kwargs)


def _record_memory_history_impl(
    enabled: Optional[str] = "all",
    context: Optional[str] = "all",
    stacks: str = "all",
    max_entries: int = sys.maxsize,
    device: Union[Device, int] = None,
):
    _C._cuda_record_memory_history(enabled, context, stacks, max_entries)


_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]


def _snapshot(device: Union[Device, int] = None):
    """Save a snapshot of CUDA memory state at the time it was called.

    The state is represented as a dictionary with the following structure.

    .. code-block:: python

        class Snapshot(TypedDict):
            segments : List[Segment]
            device_traces: List[List[TraceEntry]]

        class Segment(TypedDict):
            # Segments are memory returned from a cudaMalloc call.
            # The size of reserved memory is the sum of all Segments.
            # Segments are cached and reused for future allocations.
            # If the reuse is smaller than the segment, the segment
            # is split into more than one Block.
            # empty_cache() frees Segments that are entirely inactive.
            address: int
            total_size: int  # cudaMalloc'd size of segment
            stream: int
            segment_type: Literal['small', 'large']  # 'large' (>1MB)
            allocated_size: int  # size of memory in use
            active_size: int  # size of memory in use or in active_awaiting_free state
            blocks : List[Block]

        class Block(TypedDict):
            # A piece of memory returned from the allocator, or
            # currently cached but inactive.
            size: int
            requested_size: int  # size requested during malloc, may be smaller than
                                 # size due to rounding
            address: int
            state: Literal['active_allocated',  # used by a tensor
                           'active_awaiting_free',  # waiting for another stream to finish using
                                                    # this, then it will become free
                           'inactive',]  # free for reuse
            frames: List[Frame]  # stack trace from where the allocation occurred

        class Frame(TypedDict):
            filename: str
            line: int
            name: str

        class TraceEntry(TypedDict):
            # When `torch.cuda.memory._record_memory_history()` is enabled,
            # the snapshot will contain TraceEntry objects that record each
            # action the allocator took.
            action: Literal[
                'alloc',  # memory allocated
                'free_requested',  # the allocator received a call to free memory
                'free_completed',  # the memory that was requested to be freed is now
                                   # able to be used in future allocation calls
                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
                                  # and added it as a segment in its cache
                'segment_free',  # the caching allocator called cudaFree to return memory
                                 # to cuda, possibly trying to free up memory to
                                 # allocate more segments or because empty_cache was called
                'oom',  # the allocator threw an OOM exception. 'size' is
                        # the requested number of bytes that did not succeed
                'snapshot'  # the allocator generated a memory snapshot
                            # useful to correlate a previously taken
                            # snapshot with this trace
            ]
            addr: int  # not present for OOM
            frames: List[Frame]
            size: int
            stream: int
            device_free: int  # only present for OOM, the amount of
                              # memory cuda still reports to be free

    Returns:
        The Snapshot dictionary object
"""
    return _C._cuda_memorySnapshot()


def _dump_snapshot(filename="dump_snapshot.pickle"):
    """
    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.

    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz

    Args:
        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
    """
    s = _snapshot()
    with open(filename, "wb") as f:
        pickle.dump(s, f)


def _save_segment_usage(filename="output.svg", snapshot=None):
    # Write a visualization of segment usage (via ._memory_viz.segments)
    # for the given snapshot, or a freshly taken one.
    if snapshot is None:
        snapshot = _snapshot()
    with open(filename, "w") as f:
        f.write(_segments(snapshot))


def _save_memory_usage(filename="output.svg", snapshot=None):
    # Write a visualization of memory usage (via ._memory_viz.memory)
    # for the given snapshot, or a freshly taken one.
    if snapshot is None:
        snapshot = _snapshot()
    with open(filename, "w") as f:
        f.write(_memory(snapshot))


def _set_allocator_settings(env: str):
    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)


def get_allocator_backend() -> str:
    r"""Return a string describing the active allocator backend as set by
    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
    (CUDA's built-in asynchronous allocator).

    .. note::
        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
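
    Example (a minimal sketch; ``native`` is the default when
    ``PYTORCH_CUDA_ALLOC_CONF`` does not select a backend)::

        >>> torch.cuda.get_allocator_backend()  # doctest: +SKIP
        'native'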
"""
    return torch._C._cuda_getAllocatorBackend()


class _CUDAAllocator:
    r"""Wrapper over internal CUDA memory allocators."""

    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
        self._allocator = allocator

    def allocator(self):
        return self._allocator


class CUDAPluggableAllocator(_CUDAAllocator):
    r"""CUDA memory allocator loaded from a `.so` file."""

    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.

        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.

        Args:
            path_to_so_file(str): Path in the filesystem to the `.so` file containing
                the allocator functions
            alloc_fn_name(str): Name of the function to perform the memory allocation
                in the so file. The signature must be:
                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
            free_fn_name(str): Name of the function to perform the memory release
                in the so file. The signature must be:
                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);

        .. warning::
            This is currently supported only on Unix-like operating systems.

        .. note::
            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
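
        Example (a minimal sketch; ``alloc.so``, ``my_alloc`` and ``my_free``
        are placeholder names for a user-compiled allocator library)::

            >>> allocator = torch.cuda.memory.CUDAPluggableAllocator(
            ...     "alloc.so", "my_alloc", "my_free"
            ... )
            >>> torch.cuda.memory.change_current_allocator(allocator)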
"""
        allocator = ctypes.CDLL(path_to_so_file)
        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
        assert alloc_fn is not None
        assert free_fn is not None
        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)


def change_current_allocator(allocator: _CUDAAllocator) -> None:
    r"""Change the currently used memory allocator to be the one provided.

    If the current allocator has already been used/initialized, this function will error.

    Args:
        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.

    .. note::
        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
    """
    torch._C._cuda_changeCurrentAllocator(allocator.allocator())


def _get_current_allocator() -> _CUDAAllocator:
    r"""Return the allocator being currently used.

    .. note::
        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
    """
    return _CUDAAllocator(torch._C._cuda_getAllocator())