import bisect
import itertools
import math

from collections import defaultdict, namedtuple
from operator import attrgetter

from typing import Any, Dict, List, Optional, Tuple

import torch
from torch.autograd import DeviceType

__all__ = [
    "EventList",
    "FormattedTimesMixin",
    "Interval",
    "Kernel",
    "FunctionEvent",
    "FunctionEventAvg",
    "StringTable",
    "MemRecordsAcc",
]


class EventList(list):
    """A list of Events (for pretty printing)."""

    def __init__(self, *args, **kwargs):
        use_cuda = kwargs.pop("use_cuda", True)
        use_device = kwargs.pop("use_device", None)
        profile_memory = kwargs.pop("profile_memory", False)
        with_flops = kwargs.pop("with_flops", False)
        super().__init__(*args, **kwargs)
        self._use_cuda = use_cuda
        self._use_device = use_device
        self._profile_memory = profile_memory
        self._tree_built = False
        self._with_flops = with_flops

    def _build_tree(self):
        self._populate_cpu_children()
        self._remove_dup_nodes()
        self._set_backward_stacktraces()
        self._tree_built = True

    def __str__(self):
        return self.table()

    def _remove_dup_nodes(self):
        while True:
            to_delete = set()
            for idx in range(len(self)):
                if (
                    self[idx].cpu_parent is not None
                    and self[idx].cpu_parent.name == self[idx].name
                    and len(self[idx].cpu_parent.cpu_children) == 1
                ):
                    self[idx].cpu_parent.cpu_children = self[idx].cpu_children
                    self[idx].cpu_parent.kernels = self[idx].kernels  # lift kernels up
                    for ch in self[idx].cpu_children:
                        ch.cpu_parent = self[idx].cpu_parent
                    to_delete.add(idx)
            if len(to_delete) == 0:
                break
            new_evts = [ev for ind, ev in enumerate(self) if ind not in to_delete]
            self.clear()
            self.extend(new_evts)

    def _populate_cpu_children(self):
        """Populate child events into each underlying FunctionEvent object.

        One event is a child of another if [s1, e1) is inside [s2, e2), where
        s1 and e1 are the start and end of the child event's interval and
        s2 and e2 are the start and end of the parent event's interval.

        Example: in the event list [[0, 10], [1, 3], [3, 4]], the interval
        [0, 10] becomes the parent of the two other intervals.

        If for any reason two intervals intersect only partially, this function
        will not record a parent-child relationship between them.
        """
        # Some events can be async (i.e. start and end on different threads);
        # since it's generally undefined how to attribute children ranges to
        # async ranges, we do not use them when calculating nested ranges and stats
        sync_events = [
            evt
            for evt in self
            if not evt.is_async and evt.device_type == DeviceType.CPU
        ]
        events = sorted(
            sync_events,
            key=attrgetter("thread"),
        )
        # Group by both thread and node_id, so that events that happen to have
        # the same thread_id but are from different nodes aren't incorrectly
        # grouped together.
        threads = itertools.groupby(
            events, key=lambda event: (event.thread, event.node_id)
        )

        # For each thread we keep a stack of current nested parents.
        # We maintain the invariant that each interval is a subset of all other
        # intervals lower in the stack.
        #
        # First we sort the intervals by their start time. Then we iterate over
        # them. Every time we see a new interval, we pop parents from the top of
        # the stack until the invariant is restored. A parent-child relationship
        # is then recorded if the stack is not empty.
        # Finally, we push the new interval onto the stack.
        #
        # The algorithm has O(N * log(N)) complexity, where N is the number of
        # intervals.
        for thread_id, thread_events in threads:
            thread_events_ = sorted(
                thread_events,
                key=lambda event: [event.time_range.start, -event.time_range.end],
            )
            current_events: List[FunctionEvent] = []
            cur_end = 0
            for event in thread_events_:
                while len(current_events) > 0:
                    parent = current_events[-1]
                    if (
                        event.time_range.start >= parent.time_range.end
                        or event.time_range.end > parent.time_range.end
                    ):
                        # this can't be a parent
                        current_events.pop()
                    else:
                        parent.append_cpu_child(event)
                        assert (
                            event.cpu_parent is None
                        ), f"There is already a CPU parent event for {event.key}"
                        event.set_cpu_parent(parent)
                        break

                current_events.append(event)
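
    # A worked illustration of the stack invariant above (a sketch, using the
    # intervals from the docstring rather than real FunctionEvent objects):
    #
    #   intervals sorted by (start, -end): [0, 10), [1, 3), [3, 4)
    #   see [0, 10): stack empty, push                -> stack: [[0, 10)]
    #   see [1, 3):  nested in [0, 10); record
    #                parent-child, push               -> stack: [[0, 10), [1, 3)]
    #   see [3, 4):  starts at/after the end of
    #                [1, 3), so pop it; nested in
    #                [0, 10); record parent-child,
    #                push                             -> stack: [[0, 10), [3, 4)]
    #
    # Both [1, 3) and [3, 4) end up as direct children of [0, 10).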

    def _set_backward_stacktraces(self):
        def bw_parent(evt):
            if evt is None:
                return None
            elif evt.scope == 1:  # BACKWARD_FUNCTION
                return evt
            else:
                return bw_parent(evt.cpu_parent)

        fwd_stacks = {}
        for evt in self:
            if bw_parent(evt) is None and evt.stack is not None:
                t = (evt.sequence_nr, evt.thread)
                if t not in fwd_stacks:
                    fwd_stacks[t] = evt.stack

        for evt in self:
            p = bw_parent(evt)
            if p is not None:
                assert p.fwd_thread is not None
                t = (p.sequence_nr, p.fwd_thread)
                if t in fwd_stacks:
                    evt.stack = fwd_stacks[t]
                else:
                    evt.stack = []

    @property
    def self_cpu_time_total(self):
        return sum([event.self_cpu_time_total for event in self])

    def table(
        self,
        sort_by=None,
        row_limit=100,
        max_src_column_width=75,
        max_name_column_width=55,
        max_shapes_column_width=80,
        header=None,
        top_level_events_only=False,
    ):
        """Print an EventList as a nicely formatted table.

        Args:
            sort_by (str, optional): Attribute used to sort entries. By default
                they are printed in the same order as they were registered.
                Valid keys include: ``cpu_time``, ``cuda_time``, ``cpu_time_total``,
                ``cuda_time_total``, ``cpu_memory_usage``, ``cuda_memory_usage``,
                ``self_cpu_memory_usage``, ``self_cuda_memory_usage``, ``count``.
            top_level_events_only (bool, optional): Boolean flag to determine the
                selection of events to display. If true, the profiler will only
                display events at the top level, like the top-level invocations of
                python ``lstm``, python ``add`` or other functions; nested events,
                such as low-level cpu/cuda ops, are omitted to keep the profiler
                output readable.

        Returns:
            A string containing the table.
        """
        return _build_table(
            self,
            sort_by=sort_by,
            row_limit=row_limit,
            max_src_column_width=max_src_column_width,
            max_name_column_width=max_name_column_width,
            max_shapes_column_width=max_shapes_column_width,
            header=header,
            profile_memory=self._profile_memory,
            with_flops=self._with_flops,
            top_level_events_only=top_level_events_only,
        )
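
    # A minimal usage sketch (assumes the standard torch.autograd.profiler
    # entry point, which exposes its events as an EventList):
    #
    #     import torch
    #     from torch.autograd import profiler
    #
    #     with profiler.profile() as prof:
    #         torch.randn(128, 128).mm(torch.randn(128, 128))
    #     print(prof.key_averages().table(sort_by="self_cpu_time_total",
    #                                     row_limit=10))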

    def export_chrome_trace(self, path):
        """Export an EventList as a Chrome tracing tools file.

        The checkpoint can be later loaded and inspected under the
        ``chrome://tracing`` URL.

        Args:
            path (str): Path where the trace will be written.
        """
        import os

        device_name = "cuda" if not self._use_device else self._use_device
        with open(path, "w") as f:
            chrome_events = []
            next_id = 0
            # Use raw file IO instead of json.dump, since JSON dumping is very
            # slow; this technique has been measured to give a ~4x speedup.
            f.write("[")
            for evt in self:
                if evt.trace_name is None:
                    continue
                f.write(
                    '{{"name": "{}", '
                    '"ph": "X", '
                    '"ts": {}, '
                    '"dur": {}, '
                    '"tid": {}, '
                    '"pid": "CPU functions", '
                    '"args": {{}}}}, '.format(
                        evt.trace_name,
                        evt.time_range.start,
                        evt.time_range.elapsed_us(),
                        evt.thread
                        if not evt.is_remote
                        else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "',
                    )
                )
                for k in evt.kernels:
                    # 's' and 'f' draw Flow arrows from
                    # the CPU launch to the GPU kernel
                    f.write(
                        f'{{"name": "{evt.trace_name}", '
                        '"ph": "s", '
                        f'"ts": {evt.time_range.start}, '
                        f'"tid": {evt.thread}, '
                        '"pid": "CPU functions", '
                        f'"id": {next_id}, '
                        f'"cat": "cpu_to_{device_name}", '
                        '"args": {}}, '
                    )
                    # Note: use torch.profiler to get device kernel trace
                    next_id += 1
            if len(self) > 0:
                # remove trailing whitespace and comma
                f.seek(f.tell() - 2, os.SEEK_SET)
                f.truncate()
            f.write("]")
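
    # A usage sketch (the file name is illustrative): export the collected
    # events, then load the file in a Chromium-based browser for inspection:
    #
    #     prof.export_chrome_trace("trace.json")
    #     # then open chrome://tracing and load trace.json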

    def supported_export_stacks_metrics(self):
        return [
            "self_cpu_time_total",
            "self_cuda_time_total",
            "self_privateuse1_time_total",
        ]

    def export_stacks(self, path: str, metric: str):
        if metric not in self.supported_export_stacks_metrics():
            raise ValueError(
                "metric should be one of: "
                + str(self.supported_export_stacks_metrics())
            )
        translate_table = str.maketrans(" ;\t\n", "____")
        with open(path, "w") as f:
            for evt in self:
                if evt.stack and len(evt.stack) > 0:
                    metric_value = getattr(evt, metric)
                    if int(metric_value) > 0:
                        stack_str = ""
                        for entry in reversed(evt.stack):
                            stack_str += entry.translate(translate_table)
                            stack_str += ";"
                        stack_str = stack_str[:-1] + " " + str(int(metric_value))
                        f.write(stack_str + "\n")
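
    # A usage sketch (hedged; the file name and FlameGraph invocation are
    # illustrative): each output line is in the "frame;frame;... value" format
    # consumed by flame graph tools such as Brendan Gregg's flamegraph.pl:
    #
    #     prof.export_stacks("profiler_stacks.txt", "self_cpu_time_total")
    #     # ./flamegraph.pl --countname us profiler_stacks.txt > perf.svg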

    def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0):
        """Averages all function events over their keys.

        Args:
            group_by_input_shapes: group entries by
                (event name, input shapes) rather than just event name.
                This is useful to see which input shapes contribute to the
                runtime the most, and may help with size-specific optimizations
                or choosing the best candidates for quantization (aka fitting a
                roofline).

            group_by_stack_n: group by top n stack trace entries

        Returns:
            An EventList containing FunctionEventAvg objects.
        """
        assert self._tree_built
        stats: Dict[Tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)

        def get_key(event, group_by_input_shapes, group_by_stack_n) -> Tuple[str, ...]:
            key = [
                str(event.key),
                str(event.node_id),
                str(event.device_type),
                str(event.is_legacy),
            ]
            if group_by_input_shapes:
                key.append(str(event.input_shapes))
            if group_by_stack_n > 0:
                key += event.stack[:group_by_stack_n]
            return tuple(key)

        for evt in self:
            stats[get_key(evt, group_by_input_shapes, group_by_stack_n)].add(evt)

        avg_list = EventList(
            stats.values(),
            use_cuda=self._use_cuda,
            use_device=self._use_device,
            profile_memory=self._profile_memory,
            with_flops=self._with_flops,
        )
        for evt in avg_list:
            evt.stack = evt.stack[:group_by_stack_n]
            if not group_by_input_shapes:
                evt.input_shapes = ""
        return avg_list
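
    # A usage sketch (hedged): grouping by (name, input shapes) separates, for
    # example, aten::mm on [[128, 128], [128, 128]] from aten::mm on larger
    # inputs, which makes shape-specific hotspots visible. Note that input
    # shapes are only collected when profiling with record_shapes=True:
    #
    #     averages = prof.key_averages(group_by_input_shapes=True)
    #     print(averages.table(sort_by="cpu_time_total"))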

    def total_average(self):
        """Averages all events.

        Returns:
            A FunctionEventAvg object.
        """
        total_stat = FunctionEventAvg()
        for evt in self:
            total_stat += evt
            # reset the key so the next add() doesn't trip its key-equality assert
            total_stat.key = None
        total_stat.key = "Total"
        return total_stat


def _format_time(time_us):
    """Define how to format time in FunctionEvent."""
    US_IN_SECOND = 1000.0 * 1000.0
    US_IN_MS = 1000.0
    if time_us >= US_IN_SECOND:
        return f"{time_us / US_IN_SECOND:.3f}s"
    if time_us >= US_IN_MS:
        return f"{time_us / US_IN_MS:.3f}ms"
    return f"{time_us:.3f}us"
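
# Expected behavior of the formatter above (a sketch, values derived from the
# thresholds in the code):
#
#     _format_time(42)         -> "42.000us"
#     _format_time(2_500)      -> "2.500ms"
#     _format_time(1_500_000)  -> "1.500s"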


def _format_time_share(time_us, total_time_us):
    """Define how to format time in FunctionEvent."""
    if total_time_us == 0:
        assert time_us == 0, f"Expected time_us == 0 but got {time_us}"
        return "NaN"
    return f"{time_us * 100.0 / total_time_us:.2f}%"


def _format_memory(nbytes):
    """Return a formatted memory size string."""
    KB = 1024
    MB = 1024 * KB
    GB = 1024 * MB
    if abs(nbytes) >= GB:
        return f"{nbytes * 1.0 / GB:.2f} Gb"
    elif abs(nbytes) >= MB:
        return f"{nbytes * 1.0 / MB:.2f} Mb"
    elif abs(nbytes) >= KB:
        return f"{nbytes * 1.0 / KB:.2f} Kb"
    else:
        return str(nbytes) + " b"
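
# Expected behavior of the memory formatter (a sketch; note that it uses
# binary units, and negative values denote freed memory):
#
#     _format_memory(512)              -> "512 b"
#     _format_memory(5 * 1024 * 1024)  -> "5.00 Mb"
#     _format_memory(-2048)            -> "-2.00 Kb"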


def _attr_formatter(name):
    return property(lambda self: _format_time(getattr(self, name)))
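
# For example, `cpu_time_str = _attr_formatter("cpu_time")` in the mixin below
# creates a read-only property equivalent to:
#
#     @property
#     def cpu_time_str(self):
#         return _format_time(self.cpu_time)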


class FormattedTimesMixin:
    """Helpers for FunctionEvent and FunctionEventAvg.

    The subclass should define `*_time_total` and `count` attributes.
    """

    cpu_time_str = _attr_formatter("cpu_time")
    cuda_time_str = _attr_formatter("cuda_time")
    privateuse1_time_str = _attr_formatter("privateuse1_time")
    cpu_time_total_str = _attr_formatter("cpu_time_total")
    cuda_time_total_str = _attr_formatter("cuda_time_total")
    privateuse1_time_total_str = _attr_formatter("privateuse1_time_total")
    self_cpu_time_total_str = _attr_formatter("self_cpu_time_total")
    self_cuda_time_total_str = _attr_formatter("self_cuda_time_total")
    self_privateuse1_time_total_str = _attr_formatter("self_privateuse1_time_total")

    @property
    def cpu_time(self):
        return 0.0 if self.count == 0 else 1.0 * self.cpu_time_total / self.count  # type: ignore[attr-defined]

    @property
    def cuda_time(self):
        return 0.0 if self.count == 0 else 1.0 * self.cuda_time_total / self.count  # type: ignore[attr-defined]

    @property
    def privateuse1_time(self):
        return 0.0 if self.count == 0 else 1.0 * self.privateuse1_time_total / self.count  # type: ignore[attr-defined]


class Interval:
    def __init__(self, start, end):
        self.start = start
        self.end = end

    def elapsed_us(self):
        r"""Return the length of the interval in microseconds."""
        return self.end - self.start


Kernel = namedtuple("Kernel", ["name", "device", "duration"])


class FunctionEvent(FormattedTimesMixin):
    """Profiling information about a single function."""

    def __init__(
        self,
        id,
        name,
        thread,
        start_us,
        end_us,
        fwd_thread=None,
        input_shapes=None,
        stack=None,
        scope=0,
        use_device=None,
        cpu_memory_usage=0,
        cuda_memory_usage=0,
        privateuse1_memory_usage=0,
        is_async=False,
        is_remote=False,
        sequence_nr=-1,
        node_id=-1,
        device_type=DeviceType.CPU,
        device_index=0,
        is_legacy=False,
        flops=None,
        trace_name=None,
        concrete_inputs=None,
    ):
        self.id: int = id
        self.node_id: int = node_id
        self.name: str = name
        self.trace_name: str = trace_name
        self.time_range: Interval = Interval(start_us, end_us)
        self.thread: int = thread
        self.fwd_thread: Optional[int] = fwd_thread
        self.kernels: List[Kernel] = []
        self.count: int = 1
        self.cpu_children: List[FunctionEvent] = []
        self.cpu_parent: Optional[FunctionEvent] = None
        self.input_shapes: Tuple[int, ...] = input_shapes
        self.concrete_inputs: List[Any] = concrete_inputs
        self.stack: List = stack
        self.scope: int = scope
        self.use_device: Optional[str] = use_device
        self.cpu_memory_usage: int = cpu_memory_usage
        self.cuda_memory_usage: int = cuda_memory_usage
        self.privateuse1_memory_usage: int = privateuse1_memory_usage
        self.is_async: bool = is_async
        self.is_remote: bool = is_remote
        self.sequence_nr: int = sequence_nr
        self.device_type: DeviceType = device_type
        self.device_index: int = device_index
        self.is_legacy: bool = is_legacy
        self.flops: Optional[int] = flops

    def append_kernel(self, name, device, duration):
        assert self.device_type == DeviceType.CPU
        self.kernels.append(Kernel(name, device, duration))

    def append_cpu_child(self, child):
        """Append a CPU child of type FunctionEvent.

        Only direct children should be appended to the event, so that the
        self cpu time is reported correctly.
        """
        assert self.device_type == DeviceType.CPU
        assert isinstance(child, FunctionEvent)
        assert child.device_type == DeviceType.CPU
        self.cpu_children.append(child)

    def set_cpu_parent(self, parent):
        """Set the immediate CPU parent of type FunctionEvent.

        One profiling FunctionEvent should have only one CPU parent such that
        the child's range interval is completely inside the parent's. We use
        this connection to determine whether the event is from a top-level op
        or not.
        """
        assert self.device_type == DeviceType.CPU
        assert isinstance(parent, FunctionEvent)
        assert parent.device_type == DeviceType.CPU
        self.cpu_parent = parent
    # Note: async events don't have children, are not used when computing 'self'
    # metrics of other events, and have only total cpu time
    @property
    def self_cpu_memory_usage(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cpu_memory_usage - sum(
            [child.cpu_memory_usage for child in self.cpu_children]
        )

    @property
    def self_cuda_memory_usage(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cuda_memory_usage - sum(
            [child.cuda_memory_usage for child in self.cpu_children]
        )

    @property
    def self_privateuse1_memory_usage(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.privateuse1_memory_usage - sum(
            [child.privateuse1_memory_usage for child in self.cpu_children]
        )

    @property
    def self_cpu_time_total(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cpu_time_total - sum(
            [child.cpu_time_total for child in self.cpu_children]
        )

    @property
    def cuda_time_total(self):
        if self.is_async or self.use_device:
            return 0
        if self.device_type == DeviceType.CPU:
            if not self.is_legacy:
                # account for the kernels in the children ops
                return sum(kinfo.duration for kinfo in self.kernels) + sum(
                    ch.cuda_time_total for ch in self.cpu_children
                )
            else:
                # each legacy CPU event has a single (fake) kernel
                return sum(kinfo.duration for kinfo in self.kernels)
        else:
            assert self.device_type == DeviceType.CUDA
            return self.time_range.elapsed_us()

    @property
    def self_cuda_time_total(self):
        if self.is_async or self.use_device:
            return 0
        if self.device_type == DeviceType.CPU:
            return self.cuda_time_total - sum(
                [child.cuda_time_total for child in self.cpu_children]
            )
        else:
            assert self.device_type == DeviceType.CUDA
            return self.cuda_time_total

    @property
    def cpu_time_total(self):
        if self.device_type == DeviceType.CPU:
            return self.time_range.elapsed_us()
        else:
            return 0

    @property
    def self_privateuse1_time_total(self):
        if self.is_async or not self.use_device:
            return 0
        if self.device_type == DeviceType.CPU:
            return self.privateuse1_time_total - sum(
                [child.privateuse1_time_total for child in self.cpu_children]
            )
        else:
            assert self.device_type == DeviceType.PrivateUse1
            return self.privateuse1_time_total

    @property
    def privateuse1_time_total(self):
        if self.is_async or not self.use_device:
            return 0
        if self.device_type == DeviceType.CPU:
            if not self.is_legacy:
                # account for the kernels in the children ops
                return sum(kinfo.duration for kinfo in self.kernels) + sum(
                    ch.privateuse1_time_total for ch in self.cpu_children
                )
            else:
                # each legacy CPU event has a single (fake) kernel
                return sum(kinfo.duration for kinfo in self.kernels)
        else:
            assert self.device_type == DeviceType.PrivateUse1
            return self.time_range.elapsed_us()

    @property
    def key(self):
        return self.name

    def __repr__(self):
        device_name = "cuda" if not self.use_device else self.use_device
        device_time = (
            self.cuda_time_str if not self.use_device else self.privateuse1_time_str
        )
        device_memory_usage = (
            self.cuda_memory_usage
            if not self.use_device
            else self.privateuse1_memory_usage
        )
        return (
            "<FunctionEvent id={} name={} device_type={} node_id={} cpu_time={} start_us={} end_us={} "
            "cpu_children={} {}_time={} name={} thread={} input_shapes={} "
            "cpu_memory_usage={} {}_memory_usage={} is_async={} is_remote={} seq_nr={} is_legacy={}>".format(
                self.id,
                self.name,
                self.device_type,
                self.node_id,
                self.cpu_time_str,
                self.time_range.start,
                self.time_range.end,
                str([child.id for child in self.cpu_children]),
                device_name,
                device_time,
                self.name,
                self.thread,
                str(self.input_shapes),
                self.cpu_memory_usage,
                device_name,
                device_memory_usage,
                self.is_async,
                self.is_remote,
                self.sequence_nr,
                self.is_legacy,
            )
        )


class FunctionEventAvg(FormattedTimesMixin):
    """Used to average stats over multiple FunctionEvent objects."""

    def __init__(self):
        self.key: Optional[str] = None
        self.count: int = 0
        self.node_id: int = 0
        self.is_async: bool = False
        self.is_remote: bool = False
        self.use_device: Optional[str] = None
        self.cpu_time_total: int = 0
        self.cuda_time_total: int = 0
        self.privateuse1_time_total: int = 0
        self.self_cpu_time_total: int = 0
        self.self_cuda_time_total: int = 0
        self.self_privateuse1_time_total: int = 0
        self.input_shapes: Optional[List[List[int]]] = None
        self.stack: Optional[List] = None
        self.scope: Optional[int] = None
        self.cpu_memory_usage: int = 0
        self.cuda_memory_usage: int = 0
        self.privateuse1_memory_usage: int = 0
        self.self_cpu_memory_usage: int = 0
        self.self_cuda_memory_usage: int = 0
        self.self_privateuse1_memory_usage: int = 0
        self.cpu_children: Optional[List[FunctionEvent]] = None
        self.cpu_parent: Optional[FunctionEvent] = None
        self.device_type: DeviceType = DeviceType.CPU
        self.is_legacy: bool = False
        self.flops: int = 0

    def add(self, other):
        if self.key is None:
            # First function being recorded as part of FunctionEventAvg, propagate
            # fields.
            self.key = other.key
            self.node_id = other.node_id
            self.is_async = other.is_async
            self.is_remote = other.is_remote
            self.cpu_parent = other.cpu_parent
            self.cpu_children = other.cpu_children

            self.input_shapes = other.input_shapes
            self.stack = other.stack
            self.scope = other.scope
            self.device_type = other.device_type
            self.is_legacy = other.is_legacy
            self.use_device = other.use_device

        assert isinstance(other, (FunctionEvent, FunctionEventAvg))
        assert other.key == self.key
        self.cpu_time_total += other.cpu_time_total
        self.cuda_time_total += other.cuda_time_total
        self.privateuse1_time_total += other.privateuse1_time_total
        self.self_cpu_time_total += other.self_cpu_time_total
        self.self_cuda_time_total += other.self_cuda_time_total
        self.self_privateuse1_time_total += other.self_privateuse1_time_total
        self.cpu_memory_usage += other.cpu_memory_usage
        self.cuda_memory_usage += other.cuda_memory_usage
        self.privateuse1_memory_usage += other.privateuse1_memory_usage
        self.self_cpu_memory_usage += other.self_cpu_memory_usage
        self.self_cuda_memory_usage += other.self_cuda_memory_usage
        self.self_privateuse1_memory_usage += other.self_privateuse1_memory_usage
        self.count += other.count
        if self.flops is None:
            self.flops = other.flops
        elif other.flops is not None:
            self.flops += other.flops
        return self

    def __iadd__(self, other):
        return self.add(other)

    def __repr__(self):
        device_name = "cuda" if not self.use_device else self.use_device
        self_device_time = (
            self.self_cuda_time_total_str
            if not self.use_device
            else self.self_privateuse1_time_total_str
        )
        device_time = (
            self.cuda_time_str if not self.use_device else self.privateuse1_time_str
        )
        device_memory = (
            self.cuda_memory_usage
            if not self.use_device
            else self.privateuse1_memory_usage
        )
        return (
            "<FunctionEventAvg key={} self_cpu_time={} cpu_time={} "
            " self_{}_time={} {}_time={} input_shapes={} "
            "cpu_memory_usage={} {}_memory_usage={}>".format(
                self.key,
                self.self_cpu_time_total_str,
                self.cpu_time_str,
                device_name,
                self_device_time,
                device_name,
                device_time,
                str(self.input_shapes),
                self.cpu_memory_usage,
                device_name,
                device_memory,
            )
        )


class StringTable(defaultdict):
    def __missing__(self, key):
        # manage cases like 't' (demangled to 'unsigned short') separately;
        # for now simply check the length to avoid unexpected results for
        # short sequences
        self[key] = torch._C._demangle(key) if len(key) > 1 else key
        return self[key]


class MemRecordsAcc:
    """Acceleration structure for accessing mem_records in an interval."""

    def __init__(self, mem_records):
        self._mem_records = mem_records
        self._start_uses: List[int] = []
        self._indices: List[int] = []
        if len(mem_records) > 0:
            tmp = sorted([(r[0].start_us(), i) for i, r in enumerate(mem_records)])
            self._start_uses, self._indices = zip(*tmp)  # type: ignore[assignment]

    def in_interval(self, start_us, end_us):
        r"""Return all records in the given interval."""
        start_idx = bisect.bisect_left(self._start_uses, start_us)
        end_idx = bisect.bisect_right(self._start_uses, end_us)
        for i in range(start_idx, end_idx):
            yield self._mem_records[self._indices[i]]
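
# A note on the structure above (a sketch): records are sorted once by start
# time in __init__, so each in_interval() query is two binary searches
# (O(log N)) plus the matched records, rather than a linear scan:
#
#     acc = MemRecordsAcc(mem_records)
#     for rec in acc.in_interval(1000, 2000):
#         ...  # only records whose start_us falls within [1000, 2000]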


def _filter_stack_entry(entry):
    filtered_entries = [
        ("autograd/__init__", "_make_grads"),
        ("autograd/__init__", "backward"),
        ("torch/tensor", "backward"),
        ("_internal/common_utils", "prof_callable"),
        ("_internal/common_utils", "prof_func_call"),
        ("_internal/common_utils", "prof_meth_call"),
    ]
    return all(not (f[0] in entry and f[1] in entry) for f in filtered_entries)


MEMORY_EVENT_NAME = "[memory]"
OUT_OF_MEMORY_EVENT_NAME = "[OutOfMemory]"


def _filter_name(name):
    # ignoring the following utility ops
    filtered_out_names = [
        MEMORY_EVENT_NAME,  # used only for the top-level memory events
        OUT_OF_MEMORY_EVENT_NAME,
        "profiler::_record_function_enter",
        "profiler::_record_function_enter_new",
        "profiler::_record_function_exit",
        "aten::is_leaf",
        "aten::output_nr",
        "aten::_version",
    ]
    return name in filtered_out_names


# Demangles and optionally rewrites the provided event name;
# with_wildcard - whether to replace certain numbered event names
# with a wildcard name to aggregate them together in the profiler table
# output
def _rewrite_name(name, with_wildcard=False):
    string_table = StringTable()
    name = string_table[name]
    if with_wildcard:
        if name.startswith("ProfilerStep#"):
            name = "ProfilerStep*"
    return name
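
# Expected behavior (a sketch, following the branch above): per-step events
# such as "ProfilerStep#12" and "ProfilerStep#13" collapse into a single
# "ProfilerStep*" row so the table aggregates them:
#
#     _rewrite_name("ProfilerStep#12", with_wildcard=True)  -> "ProfilerStep*"
#     _rewrite_name("ProfilerStep#12")                       -> "ProfilerStep#12"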


def _build_table(
    events,
    sort_by=None,
    header=None,
    row_limit=100,
    max_src_column_width=75,
    max_name_column_width=55,
    max_shapes_column_width=80,
    with_flops=False,
    profile_memory=False,
    top_level_events_only=False,
):
    """Print a summary of events (which can be a list of FunctionEvent or FunctionEventAvg)."""
    if len(events) == 0:
        return ""

    has_cuda_time = any(event.self_cuda_time_total > 0 for event in events)
    has_cuda_mem = any(event.self_cuda_memory_usage > 0 for event in events)
    has_privateuse1_time = any(
        event.self_privateuse1_time_total > 0 for event in events
    )
    has_privateuse1_mem = any(
        event.self_privateuse1_memory_usage > 0 for event in events
    )
    use_device = events[0].use_device
    if not use_device and (has_privateuse1_mem or has_privateuse1_time):
        raise RuntimeError(
            "use_device is None, but there is private device performance data."
        )

    has_input_shapes = any(
        (event.input_shapes is not None and len(event.input_shapes) > 0)
        for event in events
    )

    if sort_by is not None:
        events = EventList(
            sorted(events, key=lambda evt: getattr(evt, sort_by), reverse=True),
            use_cuda=has_cuda_time,
            use_device=use_device,
            profile_memory=profile_memory,
            with_flops=with_flops,
        )

    name_column_width = max([len(evt.key) for evt in events]) + 4
    if max_name_column_width is not None:
        name_column_width = min(name_column_width, max_name_column_width)

    shapes_column_width = max([len(str(evt.input_shapes)) for evt in events]) + 4
    if max_shapes_column_width is not None:
        shapes_column_width = min(shapes_column_width, max_shapes_column_width)

    DEFAULT_COLUMN_WIDTH = 12
    flops_column_width = DEFAULT_COLUMN_WIDTH

    src_column_width = None
    stacks = []
    for evt in events:
        if evt.stack is not None and len(evt.stack) > 0:
            stacks.append(evt.stack)
    has_stack = len(stacks) > 0
    if has_stack:
        src_column_width = (
            max([max([len(entry) for entry in stack]) for stack in stacks]) + 4
        )
        if max_src_column_width is not None:
            src_column_width = min(src_column_width, max_src_column_width)

    headers = [
        "Name",
        "Self CPU %",
        "Self CPU",
        "CPU total %",
        "CPU total",
        "CPU time avg",
    ]
    if has_cuda_time:
        headers.extend(
            [
                "Self CUDA",
                "Self CUDA %",
                "CUDA total",
                "CUDA time avg",
            ]
        )
    if has_privateuse1_time:
        privateuse1 = use_device.upper()
        headers.extend(
            [
                f"Self {privateuse1}",
                f"Self {privateuse1} %",
                f"{privateuse1} total",
                f"{privateuse1} time avg",
            ]
        )
    if profile_memory:
        headers.extend(
            [
                "CPU Mem",
                "Self CPU Mem",
            ]
        )
    if has_cuda_mem:
        headers.extend(
            [
                "CUDA Mem",
                "Self CUDA Mem",
            ]
        )
    if has_privateuse1_mem:
        privateuse1 = use_device.upper()
        headers.extend(
            [
                f"{privateuse1} Mem",
                f"Self {privateuse1} Mem",
            ]
        )
    headers.append("# of Calls")
    # Only append Node ID if any event has a valid (>= 0) Node ID
    append_node_id = any(evt.node_id != -1 for evt in events)
    if append_node_id:
        headers.append("Node ID")

    # Single-element lists so the nested add_column helper below can mutate
    # these values in place.
    SPACING_SIZE = 2
    row_format_lst = [""]
    header_sep_lst = [""]
    line_length_lst = [-SPACING_SIZE]
    MAX_STACK_ENTRY = 5

    def add_column(padding, text_dir=">"):
        row_format_lst[0] += (
            "{: " + text_dir + str(padding) + "}" + (" " * SPACING_SIZE)
        )
        header_sep_lst[0] += "-" * padding + (" " * SPACING_SIZE)
        line_length_lst[0] += padding + SPACING_SIZE

    def auto_scale_flops(flops):
        flop_headers = [
            "FLOPs",
            "KFLOPs",
            "MFLOPs",
            "GFLOPs",
            "TFLOPs",
            "PFLOPs",
        ]
        assert flops > 0
        log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
        assert log_flops >= 0 and log_flops < len(flop_headers)
        return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])
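
    # Expected behavior of auto_scale_flops (a sketch): it picks the unit from
    # the decimal magnitude of the value and returns a matching multiplier:
    #
    #     auto_scale_flops(2.5e9)  -> (1e-9, "GFLOPs")   # 2.5e9 prints as 2.500
    #     auto_scale_flops(4.0e3)  -> (1e-3, "KFLOPs")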

    add_column(name_column_width)
    for _ in headers[1:]:
        add_column(DEFAULT_COLUMN_WIDTH)

    if has_input_shapes:
        headers.append("Input Shapes")
        add_column(shapes_column_width)

    if has_stack:
        headers.append("Source Location")
        add_column(src_column_width, text_dir="<")

    if with_flops:
        # Auto-scaling of flops header
        raw_flops = []
        for evt in events:
            if evt.flops > 0:
                raw_flops.append(evt.flops)
        if len(raw_flops) != 0:
            (flops_scale, flops_header) = auto_scale_flops(min(raw_flops))
            headers.append(f"Total {flops_header}")
            add_column(flops_column_width)
        else:
            with_flops = False  # can't find any valid flops

    row_format = row_format_lst[0]
    header_sep = header_sep_lst[0]
    line_length = line_length_lst[0]
    add_column = None  # type: ignore[assignment]

    # Accumulate output chunks in a list that the nested helper below appends to.
    result = []

    def append(s):
        result.append(s)
        result.append("\n")  # Yes, newline after the end as well

    sum_self_cpu_time_total = sum([event.self_cpu_time_total for event in events])
    sum_self_cuda_time_total = 0
    sum_self_privateuse1_time_total = 0
    for evt in events:
        if evt.device_type == DeviceType.CPU:
            # in the legacy profiler, kernel info is stored in cpu events
            if evt.is_legacy:
                if not use_device:
                    sum_self_cuda_time_total += evt.self_cuda_time_total
                else:
                    sum_self_privateuse1_time_total += evt.self_privateuse1_time_total
        elif evt.device_type == DeviceType.CUDA:
            # in the kineto profiler, there are events with the correct device type (e.g. CUDA)
            sum_self_cuda_time_total += evt.self_cuda_time_total
        elif evt.device_type == DeviceType.PrivateUse1:
            sum_self_privateuse1_time_total += evt.self_privateuse1_time_total

    # Actual printing
    if header is not None:
        append("=" * line_length)
        append(header)
    if top_level_events_only:
        append("=" * line_length)
        append("This report only displays top-level ops statistics")
    append(header_sep)
    append(row_format.format(*headers))

    append(header_sep)

    def trim_path(path, src_column_width):
        if len(path) > src_column_width:
            offset = len(path) - src_column_width
            path = path[offset:]
            if len(path) > 3:
                path = "..." + path[3:]
        return path
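
    # Expected behavior of trim_path (a sketch): long paths are truncated from
    # the left, and the first three kept characters are replaced by "...":
    #
    #     trim_path("/very/long/path/to/file.py", 15)  -> "...h/to/file.py"
    #     trim_path("short.py", 15)                    -> "short.py"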

    event_limit = 0
    for evt in events:
        if event_limit == row_limit:
            break
        if top_level_events_only and evt.cpu_parent is not None:
            continue
        else:
            event_limit += 1
        name = evt.key
        if max_name_column_width is not None and len(name) >= max_name_column_width - 3:
            name = name[: (max_name_column_width - 3)] + "..."
        row_values = [
            name,
            # Self CPU total %, 0 for async events.
            _format_time_share(evt.self_cpu_time_total, sum_self_cpu_time_total),
            evt.self_cpu_time_total_str,  # Self CPU total
            # CPU total %, 0 for async events.
            _format_time_share(evt.cpu_time_total, sum_self_cpu_time_total)
            if not evt.is_async
            else 0,
            evt.cpu_time_total_str,  # CPU total
            evt.cpu_time_str,  # CPU time avg
        ]
        if has_cuda_time:
            row_values.extend(
                [
                    evt.self_cuda_time_total_str,
                    # CUDA time total %
                    _format_time_share(
                        evt.self_cuda_time_total, sum_self_cuda_time_total
                    ),
                    evt.cuda_time_total_str,
                    evt.cuda_time_str,  # CUDA time avg
                ]
            )
        if has_privateuse1_time:
            row_values.extend(
                [
                    evt.self_privateuse1_time_total_str,
                    # PrivateUse1 time total %
                    _format_time_share(
                        evt.self_privateuse1_time_total, sum_self_privateuse1_time_total
                    ),
                    evt.privateuse1_time_total_str,
                    evt.privateuse1_time_str,  # PrivateUse1 time avg
                ]
            )
        if profile_memory:
            row_values.extend(
                [
                    # CPU Mem Total
                    _format_memory(evt.cpu_memory_usage),
                    # Self CPU Mem Total
                    _format_memory(evt.self_cpu_memory_usage),
                ]
            )
        if has_cuda_mem:
            row_values.extend(
                [
                    # CUDA Mem Total
                    _format_memory(evt.cuda_memory_usage),
                    # Self CUDA Mem Total
                    _format_memory(evt.self_cuda_memory_usage),
                ]
            )
        if has_privateuse1_mem:
            row_values.extend(
                [
                    # PrivateUse1 Mem Total
                    _format_memory(evt.privateuse1_memory_usage),
                    # Self PrivateUse1 Mem Total
                    _format_memory(evt.self_privateuse1_memory_usage),
                ]
            )
        row_values.append(
            evt.count,  # Number of calls
        )

        if append_node_id:
            row_values.append(evt.node_id)
        if has_input_shapes:
            row_values.append(str(evt.input_shapes)[:shapes_column_width])
        if with_flops:
            if evt.flops <= 0:
                row_values.append("--")
            else:
                row_values.append(f"{evt.flops * flops_scale:8.3f}")  # type: ignore[possibly-undefined]
        if has_stack:
            src_field = ""
            if len(evt.stack) > 0:
                src_field = trim_path(evt.stack[0], src_column_width)
            row_values.append(src_field)
        append(row_format.format(*row_values))

        if has_stack:
            empty_headers = [""] * (len(headers) - 1)
            for entry in evt.stack[1:MAX_STACK_ENTRY]:
                append(
                    row_format.format(
                        *(empty_headers + [trim_path(entry, src_column_width)])
                    )
                )
            empty_headers.append("")
            append(row_format.format(*empty_headers))

    append(header_sep)
    append(f"Self CPU time total: {_format_time(sum_self_cpu_time_total)}")
    if has_cuda_time:
        append(f"Self CUDA time total: {_format_time(sum_self_cuda_time_total)}")
    if has_privateuse1_time:
        append(
            f"Self {use_device.upper()} time total: {_format_time(sum_self_privateuse1_time_total)}"
        )
    return "".join(result)