From f978d6b9b47f0b3154d08d5b8d970987d0d62e84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Thu, 25 Jan 2024 18:37:59 +0100 Subject: [PATCH] redo tests for prefetching --- .../qdp-xeonmax-simple-prefetch-perf.svg | 2541 --------------- ...tcb1-tcj2-tmul8-wl4294967296-cs8388608.csv | 6 + ...-xeonmax-simple-prefetch-weakwait-perf.svg | 2892 +++++++++++------ 3 files changed, 1844 insertions(+), 3595 deletions(-) delete mode 100644 qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-perf.svg create mode 100644 qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-tca4-tcb1-tcj2-tmul8-wl4294967296-cs8388608.csv diff --git a/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-perf.svg b/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-perf.svg deleted file mode 100644 index 1d0dc86..0000000 --- a/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-perf.svg +++ /dev/null @@ -1,2541 +0,0 @@ - - - - - - - - - - - - - - -Flame Graph - -Reset Zoom -Search -ic - - - -[[kernel.kallsyms]] (13,838,204 samples, 0.02%) - - - -[[kernel.kallsyms]] (52,582,122 samples, 0.09%) - - - -[[kernel.kallsyms]] (33,581,259 samples, 0.06%) - - - -[[kernel.kallsyms]] (8,650,216 samples, 0.02%) - - - -[[kernel.kallsyms]] (15,074,171 samples, 0.03%) - - - -[[kernel.kallsyms]] (8,457,723 samples, 0.02%) - - - -__GI_mprotect (9,972,283 samples, 0.02%) - - - -[[kernel.kallsyms]] (44,111,847 samples, 0.08%) - - - -[[kernel.kallsyms]] (5,997,398 samples, 0.01%) - - - -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - - - -syscall (19,162,146 samples, 0.03%) - - - -[[kernel.kallsyms]] (64,992,482 samples, 0.12%) - - - -__libc_start_main_impl (47,034,233,602 samples, 83.69%) -__libc_start_main_impl - - -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - - - -[[kernel.kallsyms]] (7,324,068,091 samples, 13.03%) -[[kernel.kallsyms]] - - -[[kernel.kallsyms]] (14,696,092 samples, 0.03%) - - - -dsacache::CacheData::~CacheData (27,978,244,770 samples, 49.78%) -dsacache::CacheData::~CacheData - - -unsigned long std::uniform_int_distribution<unsigned long>::operator (1,224,461,441 samples, 2.18%) -u.. - - -[[kernel.kallsyms]] (19,939,586 samples, 0.04%) - - - -[[kernel.kallsyms]] (78,676,124 samples, 0.14%) - - - -auto dml::detail::ml::make_mem_move_task<std::allocator<unsigned char> > (52,769,643 samples, 0.09%) - - - -[[kernel.kallsyms]] (58,263,661 samples, 0.10%) - - - -[[kernel.kallsyms]] (9,614,253 samples, 0.02%) - - - -dml_wait_busy_poll (27,883,144,753 samples, 49.61%) -dml_wait_busy_poll - - -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - - - -std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (10,813,731 samples, 0.02%) - - - -[[kernel.kallsyms]] (5,469,680 samples, 0.01%) - - - -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - - - -[[kernel.kallsyms]] (92,507,760 samples, 0.16%) - - - -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - - - -unsigned long std::uniform_int_distribution<unsigned long>::operator (8,778,281,021 samples, 15.62%) -unsigned long std::unifo.. - - -device_parse (16,212,211 samples, 0.03%) - - - -__GI_mprotect (50,154,117 samples, 0.09%) - - - -[[kernel.kallsyms]] (6,005,283,229 samples, 10.69%) -[[kernel.kallsy.. - - -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - - - -[[kernel.kallsyms]] (7,143,140 samples, 0.01%) - - - -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - - - -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - - - -[[kernel.kallsyms]] (26,297,644 samples, 0.05%) - - - -[[kernel.kallsyms]] (259,363,157 samples, 0.46%) - - - -_int_malloc (10,813,731 samples, 0.02%) - - - -[[kernel.kallsyms]] (9,113,218 samples, 0.02%) - - - -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - - - -[[kernel.kallsyms]] (80,579,994 samples, 0.14%) - - - -[[kernel.kallsyms]] (7,344,388 samples, 0.01%) - - - -[[kernel.kallsyms]] (8,004,974 samples, 0.01%) - - - -[[kernel.kallsyms]] (20,280,664 samples, 0.04%) - - - -dml::core::dispatcher::hw_dispatcher::initialize_hw (23,965,836 samples, 0.04%) - - - -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - - - -std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (50,149,163 samples, 0.09%) - - - -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - - - -[[kernel.kallsyms]] (256,851,116 samples, 0.46%) - - - -[[kernel.kallsyms]] (7,604,051 samples, 0.01%) - - - -[[kernel.kallsyms]] (34,047,863 samples, 0.06%) - - - -__GI__IO_file_open (17,566,875 samples, 0.03%) - - - -[[kernel.kallsyms]] (93,967,068 samples, 0.17%) - - - -[[kernel.kallsyms]] (14,391,559 samples, 0.03%) - - - -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - - - -dml::core::dispatcher::hw_queue::initialize_new_queue (4,914,889 samples, 0.01%) - - - -grow_heap (9,972,283 samples, 0.02%) - - - -[[kernel.kallsyms]] (26,783,041 samples, 0.05%) - - - -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - - - -[[kernel.kallsyms]] (7,344,388 samples, 0.01%) - - - -[[kernel.kallsyms]] (8,650,216 samples, 0.02%) - - - -[[kernel.kallsyms]] (18,034,607 samples, 0.03%) - - - -dsacache::CacheData::WaitOnCompletion (27,884,008,166 samples, 49.61%) -dsacache::CacheData::WaitOnCompletion - - -[[kernel.kallsyms]] (16,912,912 samples, 0.03%) - - - -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - - - -_int_memalign (52,769,643 samples, 0.09%) - - - -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - - - -[[kernel.kallsyms]] (50,154,117 samples, 0.09%) - - - -[[kernel.kallsyms]] (7,316,287,736 samples, 13.02%) -[[kernel.kallsyms]] - - -scan_b (201,442,301 samples, 0.36%) - - - -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - - - -std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (46,717,102 samples, 0.08%) - - - -dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (52,769,643 samples, 0.09%) - - - -[[kernel.kallsyms]] (24,636,857 samples, 0.04%) - - - -[[kernel.kallsyms]] (5,197,845 samples, 0.01%) - - - -_int_malloc (35,828,488 samples, 0.06%) - - - -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - - - -sysmalloc (52,769,643 samples, 0.09%) - - - -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - - - -[[kernel.kallsyms]] (19,162,146 samples, 0.03%) - - - -[[kernel.kallsyms]] (34,047,863 samples, 0.06%) - - - -[[kernel.kallsyms]] (25,918,503 samples, 0.05%) - - - -main (47,031,727,484 samples, 83.69%) -main - - -Filter<unsigned long, LT, (3,458,098,462 samples, 6.15%) -Filter<u.. - - -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - - - -[[kernel.kallsyms]] (14,838,521 samples, 0.03%) - - - -[[kernel.kallsyms]] (8,356,058 samples, 0.01%) - - - -accfg_wq_get_first (16,212,211 samples, 0.03%) - - - -std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (46,717,102 samples, 0.08%) - - - -[[kernel.kallsyms]] (90,779,509 samples, 0.16%) - - - -[[kernel.kallsyms]] (9,513,431 samples, 0.02%) - - - -[[kernel.kallsyms]] (15,074,171 samples, 0.03%) - - - -__GI___nptl_deallocate_stack (6,540,584 samples, 0.01%) - - - -[[kernel.kallsyms]] (15,623,013 samples, 0.03%) - - - -[[kernel.kallsyms]] (8,817,500 samples, 0.02%) - - - -dsacache::Cache::ExecuteCopy (78,330,121 samples, 0.14%) - - - -sysmalloc (35,828,488 samples, 0.06%) - - - -[[kernel.kallsyms]] (7,229,598 samples, 0.01%) - - - -[[kernel.kallsyms]] (6,370,691 samples, 0.01%) - - - -[[kernel.kallsyms]] (179,023,692 samples, 0.32%) - - - -dsacache::Cache::Clear (27,978,244,770 samples, 49.78%) -dsacache::Cache::Clear - - -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - - - -[[kernel.kallsyms]] (22,333,273 samples, 0.04%) - - - -[[kernel.kallsyms]] (16,822,016 samples, 0.03%) - - - -[[kernel.kallsyms]] (10,378,720 samples, 0.02%) - - - -allocate_stack (12,724,581 samples, 0.02%) - - - -[[kernel.kallsyms]] (12,973,510 samples, 0.02%) - - - -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - - - -[[kernel.kallsyms]] (32,804,268 samples, 0.06%) - - - -[[kernel.kallsyms]] (12,330,911 samples, 0.02%) - - - -aggr_j (2,905,466,537 samples, 5.17%) -aggr_j - - -[[kernel.kallsyms]] (25,315,610 samples, 0.05%) - - - -[[kernel.kallsyms]] (32,804,268 samples, 0.06%) - - - -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - - - -numa_alloc_onnode (146,910,655 samples, 0.26%) - - - -[[kernel.kallsyms]] (21,152,354 samples, 0.04%) - - - -dml::detail::ml::task<std::allocator<unsigned char> >::task (10,813,731 samples, 0.02%) - - - -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - - - -[[kernel.kallsyms]] (45,659,261 samples, 0.08%) - - - -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - - - -[[kernel.kallsyms]] (7,104,752 samples, 0.01%) - - - -_int_memalign (35,828,488 samples, 0.06%) - - - -[[kernel.kallsyms]] (7,169,816 samples, 0.01%) - - - -[[kernel.kallsyms]] (48,567,551 samples, 0.09%) - - - -[[kernel.kallsyms]] (52,197,849 samples, 0.09%) - - - -auto dml::detail::submit<dml::hardware, dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >, dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (78,330,121 samples, 0.14%) - - - -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - - - -[[kernel.kallsyms]] (60,893,681 samples, 0.11%) - - - -[[kernel.kallsyms]] (80,579,994 samples, 0.14%) - - - -__nptl_free_stacks (6,540,584 samples, 0.01%) - - - -[[kernel.kallsyms]] (259,363,157 samples, 0.46%) - - - -std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::_M_deallocate_nodes (27,978,244,770 samples, 49.78%) -std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pa.. - - -[[kernel.kallsyms]] (50,154,117 samples, 0.09%) - - - -__GI___libc_read (6,155,254 samples, 0.01%) - - - -__GI___mmap64 (93,967,068 samples, 0.17%) - - - -mbind (26,878,702 samples, 0.05%) - - - -[[kernel.kallsyms]] (12,902,294 samples, 0.02%) - - - -void std::allocator_traits<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::destroy<std::pair<unsigned char* const, dsacache::CacheData> > (27,978,244,770 samples, 49.78%) -void std::allocator_traits<std::allocator<std::__detail::_Hash_node<std::pair<un.. - - -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - - - -[[kernel.kallsyms]] (13,838,204 samples, 0.02%) - - - -Aggregation<unsigned long, Sum, (2,493,804,416 samples, 4.44%) -Aggre.. - - -[[kernel.kallsyms]] (49,739,189 samples, 0.09%) - - - -[[kernel.kallsyms]] (54,854,853 samples, 0.10%) - - - -[[kernel.kallsyms]] (44,111,847 samples, 0.08%) - - - -[[kernel.kallsyms]] (43,636,433 samples, 0.08%) - - - -numa_node_size64 (59,279,404 samples, 0.11%) - - - -[[kernel.kallsyms]] (76,833,477 samples, 0.14%) - - - -_mm512_stream_load_si512 (1,931,677,068 samples, 3.44%) -_mm.. - - -[[stack]] (1,230,514,799 samples, 2.19%) -[.. - - -[[kernel.kallsyms]] (10,494,303 samples, 0.02%) - - - -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - - - -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - - - -[[kernel.kallsyms]] (24,405,965 samples, 0.04%) - - - -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - - - -[[kernel.kallsyms]] (9,512,265 samples, 0.02%) - - - -dsacache::Cache::Access (319,122,983 samples, 0.57%) - - - -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - - - -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - - - -[[kernel.kallsyms]] (78,924,962 samples, 0.14%) - - - -[[kernel.kallsyms]] (93,967,068 samples, 0.17%) - - - -[[kernel.kallsyms]] (22,459,290 samples, 0.04%) - - - -[[kernel.kallsyms]] (32,804,268 samples, 0.06%) - - - -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - - - -[[kernel.kallsyms]] (12,108,425 samples, 0.02%) - - - -[[kernel.kallsyms]] (256,323,849 samples, 0.46%) - - - -std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (52,769,643 samples, 0.09%) - - - -__GI___libc_read (6,155,254 samples, 0.01%) - - - -operator new (46,717,102 samples, 0.08%) - - - -dml::core::hardware_device::submit (25,560,478 samples, 0.05%) - - - -_int_malloc (45,810,524 samples, 0.08%) - - - -[[kernel.kallsyms]] (14,391,559 samples, 0.03%) - - - -std::unordered_map<unsigned char*, dsacache::CacheData, std::hash<unsigned char*>, std::equal_to<unsigned char*>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> > >::clear (27,978,244,770 samples, 49.78%) -std::unordered_map<unsigned char*, dsacache::CacheData, std::hash<unsigned char*.. - - -dml::handler<dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >::allocator_type> dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (49,871,202 samples, 0.09%) - - - -[[kernel.kallsyms]] (5,197,845 samples, 0.01%) - - - -[[kernel.kallsyms]] (15,189,233 samples, 0.03%) - - - -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - - - -__GI_munmap (56,894,061 samples, 0.10%) - - - -[[kernel.kallsyms]] (13,011,844 samples, 0.02%) - - - -[[kernel.kallsyms]] (9,512,265 samples, 0.02%) - - - -operator new (35,828,488 samples, 0.06%) - - - -[[kernel.kallsyms]] (91,643,182 samples, 0.16%) - - - -std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (35,828,488 samples, 0.06%) - - - -[[kernel.kallsyms]] (67,470,988 samples, 0.12%) - - - -[[kernel.kallsyms]] (7,929,584 samples, 0.01%) - - - -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - - - -[[kernel.kallsyms]] (7,344,388 samples, 0.01%) - - - -[[kernel.kallsyms]] (6,785,217 samples, 0.01%) - - - -[[kernel.kallsyms]] (8,650,216 samples, 0.02%) - - - -[[kernel.kallsyms]] (7,344,388 samples, 0.01%) - - - -[[kernel.kallsyms]] (7,030,285 samples, 0.01%) - - - -[[kernel.kallsyms]] (7,450,133 samples, 0.01%) - - - -accfg_get_param_long (9,811,793 samples, 0.02%) - - - -[[kernel.kallsyms]] (29,377,009 samples, 0.05%) - - - -[[kernel.kallsyms]] (19,162,146 samples, 0.03%) - - - -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - - - -[[kernel.kallsyms]] (11,362,923 samples, 0.02%) - - - -_IO_new_file_underflow (6,564,990 samples, 0.01%) - - - -[[kernel.kallsyms]] (50,154,117 samples, 0.09%) - - - -[[kernel.kallsyms]] (6,774,676 samples, 0.01%) - - - -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - - - -dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (52,769,643 samples, 0.09%) - - - -[[kernel.kallsyms]] (25,315,610 samples, 0.05%) - - - -[[kernel.kallsyms]] (6,357,006 samples, 0.01%) - - - -[[kernel.kallsyms]] (16,010,002 samples, 0.03%) - - - -__libc_start_call_main (47,034,233,602 samples, 83.69%) -__libc_start_call_main - - -dsacache::Cache::GetCacheNode (28,516,368 samples, 0.05%) - - - -void fill_mt<unsigned long> (17,603,955,046 samples, 31.32%) -void fill_mt<unsigned long> - - -auto dml::detail::submit<dml::hardware, dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >, dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (49,871,202 samples, 0.09%) - - - -[[kernel.kallsyms]] (13,838,204 samples, 0.02%) - - - -__GI___libc_read (15,623,013 samples, 0.03%) - - - -dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (46,717,102 samples, 0.08%) - - - -[[kernel.kallsyms]] (26,878,702 samples, 0.05%) - - - -[[kernel.kallsyms]] (26,878,702 samples, 0.05%) - - - -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - - - -_IO_new_fclose (8,909,079 samples, 0.02%) - - - -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - - - -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - - - -[[kernel.kallsyms]] (80,579,994 samples, 0.14%) - - - -[[kernel.kallsyms]] (155,318,192 samples, 0.28%) - - - -add_wq (15,236,807 samples, 0.03%) - - - -openat (6,835,870 samples, 0.01%) - - - -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - - - -std::thread::_M_start_thread (12,724,581 samples, 0.02%) - - - -sysmalloc (45,810,524 samples, 0.08%) - - - -[[kernel.kallsyms]] (42,101,327 samples, 0.07%) - - - -[[kernel.kallsyms]] (5,553,168 samples, 0.01%) - - - -advise_stack_range (11,778,485 samples, 0.02%) - - - -_int_memalign (46,717,102 samples, 0.08%) - - - -[[kernel.kallsyms]] (88,186,671 samples, 0.16%) - - - -dsacache::Cache::AllocOnNode (206,190,059 samples, 0.37%) - - - -syscall (52,197,849 samples, 0.09%) - - - -[[kernel.kallsyms]] (34,047,863 samples, 0.06%) - - - -_mm512_cmplt_epi64_mask (26,906,506 samples, 0.05%) - - - -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - - - -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - - - -[[kernel.kallsyms]] (5,869,681 samples, 0.01%) - - - -[[kernel.kallsyms]] (7,323,204,071 samples, 13.03%) -[[kernel.kallsyms]] - - -[[kernel.kallsyms]] (8,650,216 samples, 0.02%) - - - -start_thread (6,580,772,960 samples, 11.71%) -start_thread - - -dml::detail::ml::task<std::allocator<unsigned char> >::task (46,717,102 samples, 0.08%) - - - -std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::_M_gen_rand (1,623,195,042 samples, 2.89%) -st.. - - -std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (52,769,643 samples, 0.09%) - - - -[[kernel.kallsyms]] (50,154,117 samples, 0.09%) - - - -[[kernel.kallsyms]] (16,480,213 samples, 0.03%) - - - -mbind (52,197,849 samples, 0.09%) - - - -[[kernel.kallsyms]] (10,757,457 samples, 0.02%) - - - -[[kernel.kallsyms]] (5,997,398 samples, 0.01%) - - - -[[kernel.kallsyms]] (8,817,500 samples, 0.02%) - - - -[[kernel.kallsyms]] (18,346,934 samples, 0.03%) - - - -[[kernel.kallsyms]] (25,918,503 samples, 0.05%) - - - -Sum<unsigned long>::simd_agg (414,838,602 samples, 0.74%) - - - -[[kernel.kallsyms]] (34,047,863 samples, 0.06%) - - - -__GI_munmap (80,579,994 samples, 0.14%) - - - -[[kernel.kallsyms]] (92,507,760 samples, 0.16%) - - - -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - - - -[[kernel.kallsyms]] (5,002,702,867 samples, 8.90%) -[[kernel.kal.. - - -[[kernel.kallsyms]] (32,408,072 samples, 0.06%) - - - -[[kernel.kallsyms]] (9,123,981 samples, 0.02%) - - - -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - - - -__GI_munmap (6,540,584 samples, 0.01%) - - - -_int_malloc (52,769,643 samples, 0.09%) - - - -[[kernel.kallsyms]] (44,111,847 samples, 0.08%) - - - -__GI___mmap64 (32,804,268 samples, 0.06%) - - - -[[kernel.kallsyms]] (85,434,447 samples, 0.15%) - - - -Vector_Loader<unsigned long, (2,993,062,044 samples, 5.33%) -Vector.. - - -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - - - -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - - - -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - - - -grow_heap (44,111,847 samples, 0.08%) - - - -_mid_memalign (35,828,488 samples, 0.06%) - - - -_start (47,036,380,628 samples, 83.69%) -_start - - -[[kernel.kallsyms]] (25,918,503 samples, 0.05%) - - - -dsacache::CacheData::Deallocate (92,507,760 samples, 0.16%) - - - -[[kernel.kallsyms]] (19,162,146 samples, 0.03%) - - - -[[kernel.kallsyms]] (8,551,434 samples, 0.02%) - - - -[[kernel.kallsyms]] (5,543,739 samples, 0.01%) - - - -[[kernel.kallsyms]] (15,896,933 samples, 0.03%) - - - -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - - - -_IO_new_file_fopen (17,566,875 samples, 0.03%) - - - -__fopen_internal (17,566,875 samples, 0.03%) - - - -unsigned long std::uniform_int_distribution<unsigned long>::operator (1,226,127,583 samples, 2.18%) -u.. - - -grow_heap (34,047,863 samples, 0.06%) - - - -[[kernel.kallsyms]] (10,376,330 samples, 0.02%) - - - -[[kernel.kallsyms]] (35,593,800 samples, 0.06%) - - - -std::thread::thread<void (12,724,581 samples, 0.02%) - - - -[[kernel.kallsyms]] (41,592,934 samples, 0.07%) - - - -dsacache::Cache::Access (199,605,584 samples, 0.36%) - - - -[[kernel.kallsyms]] (8,630,962 samples, 0.02%) - - - -[[kernel.kallsyms]] (25,315,610 samples, 0.05%) - - - -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - - - -[[kernel.kallsyms]] (10,757,457 samples, 0.02%) - - - -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - - - -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - - - -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - - - -_mid_memalign (10,813,731 samples, 0.02%) - - - -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - - - -dsacache::CacheData::WaitOnCompletion (27,883,144,753 samples, 49.61%) -dsacache::CacheData::WaitOnCompletion - - -[[kernel.kallsyms]] (5,109,987 samples, 0.01%) - - - -[[kernel.kallsyms]] (17,265,048 samples, 0.03%) - - - -dml::detail::ml::task<std::allocator<unsigned char> >::task (35,828,488 samples, 0.06%) - - - -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - - - -__GI_munmap (92,507,760 samples, 0.16%) - - - -[[kernel.kallsyms]] (93,967,068 samples, 0.17%) - - - -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - - - -[[kernel.kallsyms]] (9,919,156 samples, 0.02%) - - - -[[kernel.kallsyms]] (25,315,610 samples, 0.05%) - - - -[[kernel.kallsyms]] (5,997,398 samples, 0.01%) - - - -[[kernel.kallsyms]] (8,020,276 samples, 0.01%) - - - -[[kernel.kallsyms]] (10,757,457 samples, 0.02%) - - - -__GI___close_nocancel (7,344,388 samples, 0.01%) - - - -QDPBench (56,173,802,251 samples, 99.95%) -QDPBench - - -dml::core::dispatcher::hw_device::initialize_new_device (21,127,100 samples, 0.04%) - - - -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - - - -operator new (10,813,731 samples, 0.02%) - - - -unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (5,144,892,302 samples, 9.15%) -unsigned int .. - - -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - - - -[[kernel.kallsyms]] (8,400,747 samples, 0.01%) - - - -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - - - -[unknown] (1,234,157,360 samples, 2.20%) -[.. - - -[[kernel.kallsyms]] (6,357,006 samples, 0.01%) - - - -dml::detail::ml::impl::hardware::submit (25,560,478 samples, 0.05%) - - - -[[kernel.kallsyms]] (7,324,068,091 samples, 13.03%) -[[kernel.kallsyms]] - - -_IO_new_file_close_it (7,344,388 samples, 0.01%) - - - -LT<unsigned long>::simd_filter (26,906,506 samples, 0.05%) - - - -unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (1,015,021,080 samples, 1.81%) -u.. - - -__GI___mmap64 (93,967,068 samples, 0.17%) - - - -[[kernel.kallsyms]] (8,650,216 samples, 0.02%) - - - -[[kernel.kallsyms]] (258,139,562 samples, 0.46%) - - - -__libc_openat64 (6,835,870 samples, 0.01%) - - - -queue_stack (6,540,584 samples, 0.01%) - - - -[[kernel.kallsyms]] (11,040,629 samples, 0.02%) - - - -[[kernel.kallsyms]] (183,101,380 samples, 0.33%) - - - -__libc_open64 (17,566,875 samples, 0.03%) - - - -[[kernel.kallsyms]] (26,878,702 samples, 0.05%) - - - -[[kernel.kallsyms]] (5,302,335 samples, 0.01%) - - - -[[kernel.kallsyms]] (11,866,122 samples, 0.02%) - - - -std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (46,717,102 samples, 0.08%) - - - -_mm512_mask_add_epi64 (414,838,602 samples, 0.74%) - - - -std::pair<unsigned char* const, dsacache::CacheData>::~pair (27,978,244,770 samples, 49.78%) -std::pair<unsigned char* const, dsacache::CacheData>::~pair - - -[[kernel.kallsyms]] (33,581,259 samples, 0.06%) - - - -[[kernel.kallsyms]] (5,360,812 samples, 0.01%) - - - -[[kernel.kallsyms]] (32,112,996 samples, 0.06%) - - - -[[kernel.kallsyms]] (16,791,097 samples, 0.03%) - - - -[[kernel.kallsyms]] (6,799,118 samples, 0.01%) - - - -sudo (16,406,659 samples, 0.03%) - - - -std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::_M_deallocate_node (27,978,244,770 samples, 49.78%) -std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pa.. - - -wqs_init (16,212,211 samples, 0.03%) - - - -dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (35,828,488 samples, 0.06%) - - - -[[kernel.kallsyms]] (25,918,503 samples, 0.05%) - - - -[[kernel.kallsyms]] (38,027,005 samples, 0.07%) - - - -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - - - -_mid_memalign (52,769,643 samples, 0.09%) - - - -[[kernel.kallsyms]] (16,791,097 samples, 0.03%) - - - -operator new (52,769,643 samples, 0.09%) - - - -std::_Hashtable<unsigned char*, std::pair<unsigned char* const, dsacache::CacheData>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> >, std::__detail::_Select1st, std::equal_to<unsigned char*>, std::hash<unsigned char*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::clear (27,978,244,770 samples, 49.78%) -std::_Hashtable<unsigned char*, std::pair<unsigned char* const, dsacache::CacheD.. - - -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - - - -dsacache::CacheData::WaitOnCompletion (84,126,226 samples, 0.15%) - - - -dsacache::Cache::GetCacheNode (23,082,624 samples, 0.04%) - - - -[[kernel.kallsyms]] (93,216,145 samples, 0.17%) - - - -dsacache::Cache::SubmitTask (25,560,478 samples, 0.05%) - - - -void std::destroy_at<std::pair<unsigned char* const, dsacache::CacheData> > (27,978,244,770 samples, 49.78%) -void std::destroy_at<std::pair<unsigned char* const, dsacache::CacheData> > - - -__GI___libc_read (15,623,013 samples, 0.03%) - - - -[[kernel.kallsyms]] (89,913,865 samples, 0.16%) - - - -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - - - -Vector_Loader<unsigned long, (1,931,677,068 samples, 3.44%) -Vec.. - - -[[kernel.kallsyms]] (11,089,814 samples, 0.02%) - - - -__GI_madvise (11,778,485 samples, 0.02%) - - - -syscall (26,878,702 samples, 0.05%) - - - -[[kernel.kallsyms]] (16,071,115 samples, 0.03%) - - - -[[kernel.kallsyms]] (19,961,046 samples, 0.04%) - - - -_mid_memalign (46,717,102 samples, 0.08%) - - - -syscall (25,315,610 samples, 0.05%) - - - -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - - - -[[kernel.kallsyms]] (19,107,386 samples, 0.03%) - - - -[[kernel.kallsyms]] (13,838,204 samples, 0.02%) - - - -decltype (12,724,581 samples, 0.02%) - - - -std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (10,813,731 samples, 0.02%) - - - -[[kernel.kallsyms]] (13,514,538 samples, 0.02%) - - - -sum_check (1,371,232,114 samples, 2.44%) -su.. - - -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - - - -__GI___getdelim (22,226,508 samples, 0.04%) - - - -[[kernel.kallsyms]] (5,197,845 samples, 0.01%) - - - -[[kernel.kallsyms]] (27,648,058 samples, 0.05%) - - - -sysmalloc (10,813,731 samples, 0.02%) - - - -[[kernel.kallsyms]] (52,197,849 samples, 0.09%) - - - -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - - - -unsigned long std::uniform_int_distribution<unsigned long>::operator (7,679,240,842 samples, 13.66%) -unsigned long std::u.. - - -dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (10,813,731 samples, 0.02%) - - - -[[kernel.kallsyms]] (25,430,139 samples, 0.05%) - - - -std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (10,813,731 samples, 0.02%) - - - -[[kernel.kallsyms]] (6,347,448 samples, 0.01%) - - - -clone3 (6,606,257,445 samples, 11.75%) -clone3 - - -[[kernel.kallsyms]] (26,878,702 samples, 0.05%) - - - -__pthread_create_2_1 (12,724,581 samples, 0.02%) - - - -[[kernel.kallsyms]] (20,727,544 samples, 0.04%) - - - -dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::handler (35,828,488 samples, 0.06%) - - - -[[kernel.kallsyms]] (30,153,610 samples, 0.05%) - - - -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - - - -[[kernel.kallsyms]] (93,967,068 samples, 0.17%) - - - -auto dml::detail::ml::make_mem_move_task<std::allocator<unsigned char> > (46,717,102 samples, 0.08%) - - - -dml::core::dispatcher::hw_dispatcher::get_instance (25,315,981 samples, 0.05%) - - - -[[kernel.kallsyms]] (7,344,388 samples, 0.01%) - - - -__GI_mprotect (34,047,863 samples, 0.06%) - - - -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - - - -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - - - -__GI_mprotect (44,111,847 samples, 0.08%) - - - -numa_node_size64 (17,944,285 samples, 0.03%) - - - -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - - - -[[kernel.kallsyms]] (10,757,457 samples, 0.02%) - - - -scan_a (3,462,002,037 samples, 6.16%) -scan_a - - -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - - - -[[kernel.kallsyms]] (47,784,777 samples, 0.09%) - - - -[[kernel.kallsyms]] (11,243,448 samples, 0.02%) - - - -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - - - -std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (35,828,488 samples, 0.06%) - - - -[[kernel.kallsyms]] (9,861,578 samples, 0.02%) - - - -dsacache::Cache::ExecuteCopy (49,871,202 samples, 0.09%) - - - -grow_heap (50,154,117 samples, 0.09%) - - - -[[kernel.kallsyms]] (24,140,143 samples, 0.04%) - - - -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - - - -[[kernel.kallsyms]] (7,318,881,921 samples, 13.02%) -[[kernel.kallsyms]] - - -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - - - -std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (4,015,533,329 samples, 7.14%) -std::mers.. - - -[[kernel.kallsyms]] (10,376,330 samples, 0.02%) - - - -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - - - -[[kernel.kallsyms]] (9,861,578 samples, 0.02%) - - - -[libstdc++.so.6.0.32] (6,568,910,875 samples, 11.69%) -[libstdc++.so.6.0.. - - -[[kernel.kallsyms]] (5,197,845 samples, 0.01%) - - - -[[kernel.kallsyms]] (80,579,994 samples, 0.14%) - - - -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - - - -numa_alloc_onnode (59,682,970 samples, 0.11%) - - - -[[kernel.kallsyms]] (45,789,329 samples, 0.08%) - - - -[[kernel.kallsyms]] (50,154,117 samples, 0.09%) - - - -dml::core::dispatcher::hw_dispatcher::hw_dispatcher (23,965,836 samples, 0.04%) - - - -[[kernel.kallsyms]] (52,197,849 samples, 0.09%) - - - -_int_memalign (10,813,731 samples, 0.02%) - - - -__sysfs_device_parse (16,212,211 samples, 0.03%) - - - -[[kernel.kallsyms]] (92,507,760 samples, 0.16%) - - - -[[kernel.kallsyms]] (258,083,965 samples, 0.46%) - - - -[[kernel.kallsyms]] (5,333,154 samples, 0.01%) - - - -[[kernel.kallsyms]] (5,542,336 samples, 0.01%) - - - -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - - - -[[kernel.kallsyms]] (7,240,609,876 samples, 12.88%) -[[kernel.kallsyms]] - - -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - - - -dml::handler<dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >::allocator_type> dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (78,330,121 samples, 0.14%) - - - -std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (52,769,643 samples, 0.09%) - - - -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - - - -[[kernel.kallsyms]] (80,579,994 samples, 0.14%) - - - -[[kernel.kallsyms]] (13,726,772 samples, 0.02%) - - - -[[kernel.kallsyms]] (4,860,205 samples, 0.01%) - - - -[[kernel.kallsyms]] (6,774,676 samples, 0.01%) - - - -[[kernel.kallsyms]] (33,198,896 samples, 0.06%) - - - -[[kernel.kallsyms]] (10,757,457 samples, 0.02%) - - - -[[kernel.kallsyms]] (31,728,822 samples, 0.06%) - - - -[[kernel.kallsyms]] (15,623,013 samples, 0.03%) - - - -[[kernel.kallsyms]] (82,999,130 samples, 0.15%) - - - -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - - - -[[kernel.kallsyms]] (4,837,589 samples, 0.01%) - - - -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - - - -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - - - -dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::handler (10,813,731 samples, 0.02%) - - - -[[kernel.kallsyms]] (52,197,849 samples, 0.09%) - - - -__GI___getdelim (7,608,450 samples, 0.01%) - - - -dsacache::Cache::AllocOnNode (77,976,052 samples, 0.14%) - - - -dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::get (27,883,144,753 samples, 49.61%) -dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::get - - -[[kernel.kallsyms]] (13,547,223 samples, 0.02%) - - - -[[kernel.kallsyms]] (256,851,116 samples, 0.46%) - - - -[[kernel.kallsyms]] (89,913,865 samples, 0.16%) - - - -void std::allocator_traits<std::allocator<std::thread> >::construct<std::thread, void (12,724,581 samples, 0.02%) - - - -std::thread::join (6,839,397 samples, 0.01%) - - - -[[kernel.kallsyms]] (19,162,146 samples, 0.03%) - - - -[[kernel.kallsyms]] (15,623,013 samples, 0.03%) - - - -__pthread_clockjoin_ex (6,839,397 samples, 0.01%) - - - -_mm512_stream_load_si512 (2,993,062,044 samples, 5.33%) -_mm512.. - - -[[kernel.kallsyms]] (12,973,510 samples, 0.02%) - - - -dml::detail::ml::task<std::allocator<unsigned char> >::task (52,769,643 samples, 0.09%) - - - -[[kernel.kallsyms]] (90,779,509 samples, 0.16%) - - - -[[kernel.kallsyms]] (14,066,357 samples, 0.03%) - - - -[[kernel.kallsyms]] (15,623,013 samples, 0.03%) - - - -[[kernel.kallsyms]] (25,315,610 samples, 0.05%) - - - -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - - - -sh (5,618,491 samples, 0.01%) - - - -[[kernel.kallsyms]] (6,000,961,265 samples, 10.68%) -[[kernel.kallsy.. - - -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - - - -_IO_new_file_underflow (19,746,317 samples, 0.04%) - - - -[[kernel.kallsyms]] (15,896,933 samples, 0.03%) - - - -[[kernel.kallsyms]] (43,636,433 samples, 0.08%) - - - -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - - - -dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (46,717,102 samples, 0.08%) - - - -[[kernel.kallsyms]] (52,197,849 samples, 0.09%) - - - -[[kernel.kallsyms]] (23,722,633 samples, 0.04%) - - - -[[kernel.kallsyms]] (8,649,622 samples, 0.02%) - - - -all (56,200,887,845 samples, 100%) - - - -devices_init (6,615,510 samples, 0.01%) - - - -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - - - -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - - - -[[kernel.kallsyms]] (44,111,847 samples, 0.08%) - - - -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - - - -[[kernel.kallsyms]] (31,692,824 samples, 0.06%) - - - -[[kernel.kallsyms]] (11,169,708 samples, 0.02%) - - - -std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (35,828,488 samples, 0.06%) - - - -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - - - -[[kernel.kallsyms]] (5,721,684 samples, 0.01%) - - - -[[kernel.kallsyms]] (7,311,966,115 samples, 13.01%) -[[kernel.kallsyms]] - - -[[kernel.kallsyms]] (92,507,760 samples, 0.16%) - - - -[[kernel.kallsyms]] (36,196,373 samples, 0.06%) - - - -[[kernel.kallsyms]] (15,896,933 samples, 0.03%) - - - -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - - - -[[kernel.kallsyms]] (259,363,157 samples, 0.46%) - - - -[[kernel.kallsyms]] (19,185,252 samples, 0.03%) - - - -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - - - -dsacache::Cache::SubmitTask (167,119,904 samples, 0.30%) - - - -dsacache::Cache::SubmitTask (291,889,749 samples, 0.52%) - - - -[anon] (52,831,798 samples, 0.09%) - - - -[[kernel.kallsyms]] (6,774,676 samples, 0.01%) - - - -[[kernel.kallsyms]] (26,878,702 samples, 0.05%) - - - -__GI___mmap64 (32,804,268 samples, 0.06%) - - - -[[kernel.kallsyms]] (8,424,524 samples, 0.01%) - - - -std::thread& std::vector<std::thread, std::allocator<std::thread> >::emplace_back<void (13,581,055 samples, 0.02%) - - - -[[kernel.kallsyms]] (7,324,068,091 samples, 13.03%) -[[kernel.kallsyms]] - - -[[kernel.kallsyms]] (13,838,204 samples, 0.02%) - - - -[[kernel.kallsyms]] (5,188,239 samples, 0.01%) - - - -[[kernel.kallsyms]] (13,025,872 samples, 0.02%) - - - -[[kernel.kallsyms]] (22,884,622 samples, 0.04%) - - - - diff --git a/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-tca4-tcb1-tcj2-tmul8-wl4294967296-cs8388608.csv b/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-tca4-tcb1-tcj2-tmul8-wl4294967296-cs8388608.csv new file mode 100644 index 0000000..e7f1789 --- /dev/null +++ b/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-tca4-tcb1-tcj2-tmul8-wl4294967296-cs8388608.csv @@ -0,0 +1,6 @@ +run;rt-ns;rt-s;result[0];scana-run;scana-wait;scanb-run;scanb-wait;aggrj-run;aggrj-wait;cache-hr; +0;143486367;0.143486;830411852;6810012;164;239227808;8942;65938542;66963096;0.50293; +1;142057230;0.142057;830411852;7131075;253;394868379;8606;65626346;58540168;0.501953; +2;139312143;0.139312;830411852;6808012;175;355667149;7732;64904267;62958887;0.50293; +3;140484345;0.140484;830411852;6764746;176;264323098;8972;64246246;62007276;0.501953; +4;139969612;0.13997;830411852;6711176;185;160577686;9802;64592288;66658704;0.500977; diff --git a/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-weakwait-perf.svg b/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-weakwait-perf.svg index 1d0dc86..b15b4ed 100644 --- a/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-weakwait-perf.svg +++ b/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-weakwait-perf.svg @@ -430,2112 +430,2896 @@ -[[kernel.kallsyms]] (13,838,204 samples, 0.02%) - +[[kernel.kallsyms]] (3,384,849 samples, 0.01%) + -[[kernel.kallsyms]] (52,582,122 samples, 0.09%) - +_IO_new_fclose (3,343,282 samples, 0.01%) + -[[kernel.kallsyms]] (33,581,259 samples, 0.06%) - +[[kernel.kallsyms]] (15,353,184 samples, 0.05%) + -[[kernel.kallsyms]] (8,650,216 samples, 0.02%) - +std::pair<unsigned char* const, dsacache::CacheData>::~pair (44,773,918 samples, 0.15%) + -[[kernel.kallsyms]] (15,074,171 samples, 0.03%) - +[[kernel.kallsyms]] (7,743,611 samples, 0.03%) + -[[kernel.kallsyms]] (8,457,723 samples, 0.02%) - +[[kernel.kallsyms]] (7,684,763 samples, 0.03%) + -__GI_mprotect (9,972,283 samples, 0.02%) - +[[kernel.kallsyms]] (4,472,549 samples, 0.01%) + -[[kernel.kallsyms]] (44,111,847 samples, 0.08%) - +std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> >::allocate (3,304,446 samples, 0.01%) + -[[kernel.kallsyms]] (5,997,398 samples, 0.01%) - +[[kernel.kallsyms]] (48,607,741 samples, 0.16%) + -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - +[[kernel.kallsyms]] (6,364,338 samples, 0.02%) + -syscall (19,162,146 samples, 0.03%) - +[[kernel.kallsyms]] (48,159,760 samples, 0.16%) + -[[kernel.kallsyms]] (64,992,482 samples, 0.12%) - +__GI___fstatat64 (3,052,559 samples, 0.01%) + -__libc_start_main_impl (47,034,233,602 samples, 83.69%) -__libc_start_main_impl +void std::allocator_traits<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::destroy<std::pair<unsigned char* const, dsacache::CacheData> > (44,773,918 samples, 0.15%) + -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - +[[kernel.kallsyms]] (28,777,470 samples, 0.10%) + -[[kernel.kallsyms]] (7,324,068,091 samples, 13.03%) -[[kernel.kallsyms]] +[[kernel.kallsyms]] (2,875,827 samples, 0.01%) + -[[kernel.kallsyms]] (14,696,092 samples, 0.03%) - +__fopen_internal (8,519,108 samples, 0.03%) + -dsacache::CacheData::~CacheData (27,978,244,770 samples, 49.78%) -dsacache::CacheData::~CacheData +dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (76,042,767 samples, 0.25%) + -unsigned long std::uniform_int_distribution<unsigned long>::operator (1,224,461,441 samples, 2.18%) -u.. +dml::handler<dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >::allocator_type> dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (147,238,622 samples, 0.49%) + -[[kernel.kallsyms]] (19,939,586 samples, 0.04%) - +[[kernel.kallsyms]] (3,384,849 samples, 0.01%) + -[[kernel.kallsyms]] (78,676,124 samples, 0.14%) - +[[kernel.kallsyms]] (3,837,971 samples, 0.01%) + -auto dml::detail::ml::make_mem_move_task<std::allocator<unsigned char> > (52,769,643 samples, 0.09%) - +[[kernel.kallsyms]] (17,068,240 samples, 0.06%) + -[[kernel.kallsyms]] (58,263,661 samples, 0.10%) - +[[kernel.kallsyms]] (102,946,189 samples, 0.34%) + -[[kernel.kallsyms]] (9,614,253 samples, 0.02%) - +[[kernel.kallsyms]] (26,010,449 samples, 0.09%) + -dml_wait_busy_poll (27,883,144,753 samples, 49.61%) -dml_wait_busy_poll +allocate_stack (22,944,895 samples, 0.08%) + -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - +operator new (52,594,123 samples, 0.17%) + -std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (10,813,731 samples, 0.02%) - +[[kernel.kallsyms]] (3,052,559 samples, 0.01%) + -[[kernel.kallsyms]] (5,469,680 samples, 0.01%) - +[[kernel.kallsyms]] (11,976,927 samples, 0.04%) + -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - +_IO_new_file_fopen (10,442,407 samples, 0.03%) + -[[kernel.kallsyms]] (92,507,760 samples, 0.16%) - +[[kernel.kallsyms]] (3,327,938 samples, 0.01%) + -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - +[[kernel.kallsyms]] (19,296,338 samples, 0.06%) + -unsigned long std::uniform_int_distribution<unsigned long>::operator (8,778,281,021 samples, 15.62%) -unsigned long std::unifo.. +operator new (4,987,319 samples, 0.02%) + -device_parse (16,212,211 samples, 0.03%) - +[[kernel.kallsyms]] (4,940,101 samples, 0.02%) + -__GI_mprotect (50,154,117 samples, 0.09%) - +open (3,327,938 samples, 0.01%) + -[[kernel.kallsyms]] (6,005,283,229 samples, 10.69%) -[[kernel.kallsy.. +[[kernel.kallsyms]] (37,531,108 samples, 0.12%) + -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - +[[kernel.kallsyms]] (26,010,449 samples, 0.09%) + -[[kernel.kallsyms]] (7,143,140 samples, 0.01%) - +[[kernel.kallsyms]] (22,007,083 samples, 0.07%) + -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - +__GI___mmap64 (48,607,741 samples, 0.16%) + -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - +clone3 (8,257,939,125 samples, 27.40%) +clone3 -[[kernel.kallsyms]] (26,297,644 samples, 0.05%) - +device_parse (24,925,539 samples, 0.08%) + -[[kernel.kallsyms]] (259,363,157 samples, 0.46%) - +[[kernel.kallsyms]] (9,834,968 samples, 0.03%) + -_int_malloc (10,813,731 samples, 0.02%) - +[[kernel.kallsyms]] (15,934,374 samples, 0.05%) + -[[kernel.kallsyms]] (9,113,218 samples, 0.02%) - +std::_Vector_base<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul>, std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> > >::_M_allocate (3,304,446 samples, 0.01%) + -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - +all (30,140,784,765 samples, 100%) + -[[kernel.kallsyms]] (80,579,994 samples, 0.14%) - +__GI_mprotect (67,580,074 samples, 0.22%) + -[[kernel.kallsyms]] (7,344,388 samples, 0.01%) - +[[kernel.kallsyms]] (72,180,314 samples, 0.24%) + -[[kernel.kallsyms]] (8,004,974 samples, 0.01%) - +unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (5,085,079,286 samples, 16.87%) +unsigned int std::uniform_.. -[[kernel.kallsyms]] (20,280,664 samples, 0.04%) - +[[kernel.kallsyms]] (7,079,005 samples, 0.02%) + -dml::core::dispatcher::hw_dispatcher::initialize_hw (23,965,836 samples, 0.04%) - +std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (46,560,862 samples, 0.15%) + -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - +[[kernel.kallsyms]] (33,156,033 samples, 0.11%) + -std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (50,149,163 samples, 0.09%) - +dml::handler<dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >::allocator_type> dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (76,042,767 samples, 0.25%) + -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - +[[kernel.kallsyms]] (4,898,824 samples, 0.02%) + -[[kernel.kallsyms]] (256,851,116 samples, 0.46%) - +dsacache::CacheData::WaitOnCompletion (43,912,672 samples, 0.15%) + -[[kernel.kallsyms]] (7,604,051 samples, 0.01%) - +[[kernel.kallsyms]] (3,325,081 samples, 0.01%) + -[[kernel.kallsyms]] (34,047,863 samples, 0.06%) - +arena_get2 (4,987,319 samples, 0.02%) + -__GI__IO_file_open (17,566,875 samples, 0.03%) - +[[kernel.kallsyms]] (3,327,938 samples, 0.01%) + -[[kernel.kallsyms]] (93,967,068 samples, 0.17%) - +[[kernel.kallsyms]] (6,202,515,929 samples, 20.58%) +[[kernel.kallsyms]] -[[kernel.kallsyms]] (14,391,559 samples, 0.03%) - +accfg_get_param_long (14,952,271 samples, 0.05%) + -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - +[[kernel.kallsyms]] (7,548,151,192 samples, 25.04%) +[[kernel.kallsyms]] -dml::core::dispatcher::hw_queue::initialize_new_queue (4,914,889 samples, 0.01%) - +[[kernel.kallsyms]] (8,300,659 samples, 0.03%) + -grow_heap (9,972,283 samples, 0.02%) - +dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (76,042,767 samples, 0.25%) + -[[kernel.kallsyms]] (26,783,041 samples, 0.05%) - +[[kernel.kallsyms]] (2,963,362 samples, 0.01%) + -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - +[[kernel.kallsyms]] (2,595,937 samples, 0.01%) + -[[kernel.kallsyms]] (7,344,388 samples, 0.01%) - +[[kernel.kallsyms]] (9,141,474 samples, 0.03%) + -[[kernel.kallsyms]] (8,650,216 samples, 0.02%) - +[[kernel.kallsyms]] (4,208,454 samples, 0.01%) + -[[kernel.kallsyms]] (18,034,607 samples, 0.03%) - +std::__detail::__waiter_pool::_M_do_wait (3,384,849 samples, 0.01%) + -dsacache::CacheData::WaitOnCompletion (27,884,008,166 samples, 49.61%) -dsacache::CacheData::WaitOnCompletion +_int_memalign (72,172,962 samples, 0.24%) + -[[kernel.kallsyms]] (16,912,912 samples, 0.03%) - +__GI__IO_file_open (8,519,108 samples, 0.03%) + -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - +[[kernel.kallsyms]] (7,465,849 samples, 0.02%) + -_int_memalign (52,769,643 samples, 0.09%) - +[[kernel.kallsyms]] (25,526,558 samples, 0.08%) + -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - +[[kernel.kallsyms]] (5,150,250,561 samples, 17.09%) +[[kernel.kallsyms]] -[[kernel.kallsyms]] (50,154,117 samples, 0.09%) - +[[kernel.kallsyms]] (3,638,434 samples, 0.01%) + -[[kernel.kallsyms]] (7,316,287,736 samples, 13.02%) -[[kernel.kallsyms]] +[[kernel.kallsyms]] (14,840,164 samples, 0.05%) + -scan_b (201,442,301 samples, 0.36%) - +[[kernel.kallsyms]] (3,679,421 samples, 0.01%) + -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - +[[kernel.kallsyms]] (37,433,031 samples, 0.12%) + -std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (46,717,102 samples, 0.08%) - +LT<unsigned long>::simd_filter (25,725,457 samples, 0.09%) + -dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (52,769,643 samples, 0.09%) - +[[kernel.kallsyms]] (3,949,602 samples, 0.01%) + -[[kernel.kallsyms]] (24,636,857 samples, 0.04%) - +[[kernel.kallsyms]] (9,834,968 samples, 0.03%) + -[[kernel.kallsyms]] (5,197,845 samples, 0.01%) - +[[kernel.kallsyms]] (102,481,437 samples, 0.34%) + -_int_malloc (35,828,488 samples, 0.06%) - +std::vector<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul>, std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> > >::_M_default_append (3,304,446 samples, 0.01%) + -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - +[[kernel.kallsyms]] (10,229,240 samples, 0.03%) + -sysmalloc (52,769,643 samples, 0.09%) - +[[kernel.kallsyms]] (56,689,460 samples, 0.19%) + -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - +[[kernel.kallsyms]] (25,526,558 samples, 0.08%) + -[[kernel.kallsyms]] (19,162,146 samples, 0.03%) - +[[kernel.kallsyms]] (9,834,968 samples, 0.03%) + -[[kernel.kallsyms]] (34,047,863 samples, 0.06%) - +[dash] (4,123,565 samples, 0.01%) + -[[kernel.kallsyms]] (25,918,503 samples, 0.05%) - +operator new (3,304,446 samples, 0.01%) + -main (47,031,727,484 samples, 83.69%) -main +[[kernel.kallsyms]] (6,340,786 samples, 0.02%) + -Filter<unsigned long, LT, (3,458,098,462 samples, 6.15%) -Filter<u.. +[[kernel.kallsyms]] (5,031,880 samples, 0.02%) + -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - +[[kernel.kallsyms]] (9,141,854 samples, 0.03%) + -[[kernel.kallsyms]] (14,838,521 samples, 0.03%) - +[dash] (4,123,565 samples, 0.01%) + -[[kernel.kallsyms]] (8,356,058 samples, 0.01%) - +unsigned long std::uniform_int_distribution<unsigned long>::operator (8,803,451,547 samples, 29.21%) +unsigned long std::uniform_int_distribution<un.. -accfg_wq_get_first (16,212,211 samples, 0.03%) - +dml::detail::ml::task<std::allocator<unsigned char> >::task (52,594,123 samples, 0.17%) + -std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (46,717,102 samples, 0.08%) - +[[kernel.kallsyms]] (3,733,096 samples, 0.01%) + -[[kernel.kallsyms]] (90,779,509 samples, 0.16%) - +[[kernel.kallsyms]] (6,848,234 samples, 0.02%) + -[[kernel.kallsyms]] (9,513,431 samples, 0.02%) - +__GI_munmap (43,912,672 samples, 0.15%) + -[[kernel.kallsyms]] (15,074,171 samples, 0.03%) - +[[kernel.kallsyms]] (2,963,362 samples, 0.01%) + -__GI___nptl_deallocate_stack (6,540,584 samples, 0.01%) - +[[kernel.kallsyms]] (3,663,627 samples, 0.01%) + -[[kernel.kallsyms]] (15,623,013 samples, 0.03%) - +[[kernel.kallsyms]] (7,684,763 samples, 0.03%) + -[[kernel.kallsyms]] (8,817,500 samples, 0.02%) - +[[kernel.kallsyms]] (4,546,338 samples, 0.02%) + -dsacache::Cache::ExecuteCopy (78,330,121 samples, 0.14%) - +[[kernel.kallsyms]] (7,684,763 samples, 0.03%) + -sysmalloc (35,828,488 samples, 0.06%) - +[[kernel.kallsyms]] (8,771,525 samples, 0.03%) + -[[kernel.kallsyms]] (7,229,598 samples, 0.01%) - +[[kernel.kallsyms]] (89,429,025 samples, 0.30%) + -[[kernel.kallsyms]] (6,370,691 samples, 0.01%) - +[[kernel.kallsyms]] (4,546,338 samples, 0.02%) + -[[kernel.kallsyms]] (179,023,692 samples, 0.32%) - +[anon] (49,577,154 samples, 0.16%) + -dsacache::Cache::Clear (27,978,244,770 samples, 49.78%) -dsacache::Cache::Clear +[[kernel.kallsyms]] (10,797,481 samples, 0.04%) + -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - +dsacache::Cache::AllocOnNode (102,664,219 samples, 0.34%) + -[[kernel.kallsyms]] (22,333,273 samples, 0.04%) - +[[kernel.kallsyms]] (4,226,865 samples, 0.01%) + -[[kernel.kallsyms]] (16,822,016 samples, 0.03%) - +[[kernel.kallsyms]] (1,984,884,174 samples, 6.59%) +[[kernel.. -[[kernel.kallsyms]] (10,378,720 samples, 0.02%) - +unsigned long std::uniform_int_distribution<unsigned long>::operator (7,674,953,624 samples, 25.46%) +unsigned long std::uniform_int_distribut.. -allocate_stack (12,724,581 samples, 0.02%) - +[[kernel.kallsyms]] (1,987,085,492 samples, 6.59%) +[[kernel.. -[[kernel.kallsyms]] (12,973,510 samples, 0.02%) - +[[kernel.kallsyms]] (10,627,702 samples, 0.04%) + -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - +[[kernel.kallsyms]] (2,963,362 samples, 0.01%) + -[[kernel.kallsyms]] (32,804,268 samples, 0.06%) - +__GI_munmap (89,429,025 samples, 0.30%) + -[[kernel.kallsyms]] (12,330,911 samples, 0.02%) - +[[kernel.kallsyms]] (12,818,755 samples, 0.04%) + -aggr_j (2,905,466,537 samples, 5.17%) -aggr_j +[[kernel.kallsyms]] (9,968,198 samples, 0.03%) + -[[kernel.kallsyms]] (25,315,610 samples, 0.05%) - +[[kernel.kallsyms]] (68,850,041 samples, 0.23%) + -[[kernel.kallsyms]] (32,804,268 samples, 0.06%) - +[[kernel.kallsyms]] (56,689,460 samples, 0.19%) + -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - +[[kernel.kallsyms]] (7,465,849 samples, 0.02%) + -numa_alloc_onnode (146,910,655 samples, 0.26%) - +[[kernel.kallsyms]] (89,429,025 samples, 0.30%) + -[[kernel.kallsyms]] (21,152,354 samples, 0.04%) - +__GI___close_nocancel (3,343,282 samples, 0.01%) + -dml::detail::ml::task<std::allocator<unsigned char> >::task (10,813,731 samples, 0.02%) - +[[kernel.kallsyms]] (4,087,959 samples, 0.01%) + -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - +[[kernel.kallsyms]] (4,472,549 samples, 0.01%) + -[[kernel.kallsyms]] (45,659,261 samples, 0.08%) - +[[kernel.kallsyms]] (6,869,793 samples, 0.02%) + -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - +__GI___libc_read (8,234,982 samples, 0.03%) + -[[kernel.kallsyms]] (7,104,752 samples, 0.01%) - +[[kernel.kallsyms]] (29,618,190 samples, 0.10%) + -_int_memalign (35,828,488 samples, 0.06%) - +[[kernel.kallsyms]] (6,734,580 samples, 0.02%) + -[[kernel.kallsyms]] (7,169,816 samples, 0.01%) - +_mm512_mask_add_epi64 (381,802,722 samples, 1.27%) + -[[kernel.kallsyms]] (48,567,551 samples, 0.09%) - +_start (19,312,485,925 samples, 64.07%) +_start -[[kernel.kallsyms]] (52,197,849 samples, 0.09%) - +[[kernel.kallsyms]] (3,052,559 samples, 0.01%) + -auto dml::detail::submit<dml::hardware, dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >, dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (78,330,121 samples, 0.14%) - +[[kernel.kallsyms]] (3,319,544 samples, 0.01%) + -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - +[[kernel.kallsyms]] (6,359,112 samples, 0.02%) + -[[kernel.kallsyms]] (60,893,681 samples, 0.11%) - +[[kernel.kallsyms]] (6,340,786 samples, 0.02%) + -[[kernel.kallsyms]] (80,579,994 samples, 0.14%) - +std::_Vector_base<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul>, std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> > >::_M_allocate (4,987,319 samples, 0.02%) + -__nptl_free_stacks (6,540,584 samples, 0.01%) - +[[kernel.kallsyms]] (26,010,449 samples, 0.09%) + -[[kernel.kallsyms]] (259,363,157 samples, 0.46%) - +[[kernel.kallsyms]] (3,628,447 samples, 0.01%) + -std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::_M_deallocate_nodes (27,978,244,770 samples, 49.78%) -std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pa.. +Sum<unsigned long>::simd_agg (381,802,722 samples, 1.27%) + -[[kernel.kallsyms]] (50,154,117 samples, 0.09%) - +__nptl_free_stacks (10,627,702 samples, 0.04%) + -__GI___libc_read (6,155,254 samples, 0.01%) - +[[kernel.kallsyms]] (6,885,828 samples, 0.02%) + -__GI___mmap64 (93,967,068 samples, 0.17%) - +[[kernel.kallsyms]] (10,627,702 samples, 0.04%) + -mbind (26,878,702 samples, 0.05%) - +Vector_Loader<unsigned long, (3,787,754,318 samples, 12.57%) +Vector_Loader<unsi.. -[[kernel.kallsyms]] (12,902,294 samples, 0.02%) - +[[kernel.kallsyms]] (46,495,539 samples, 0.15%) + -void std::allocator_traits<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::destroy<std::pair<unsigned char* const, dsacache::CacheData> > (27,978,244,770 samples, 49.78%) -void std::allocator_traits<std::allocator<std::__detail::_Hash_node<std::pair<un.. +[[kernel.kallsyms]] (2,909,339 samples, 0.01%) + -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - +[[kernel.kallsyms]] (9,688,156 samples, 0.03%) + -[[kernel.kallsyms]] (13,838,204 samples, 0.02%) - +__libc_start_main_impl (19,310,456,147 samples, 64.07%) +__libc_start_main_impl -Aggregation<unsigned long, Sum, (2,493,804,416 samples, 4.44%) -Aggre.. +[[kernel.kallsyms]] (47,100,891 samples, 0.16%) + -[[kernel.kallsyms]] (49,739,189 samples, 0.09%) - +[[kernel.kallsyms]] (43,051,965 samples, 0.14%) + -[[kernel.kallsyms]] (54,854,853 samples, 0.10%) - +[[kernel.kallsyms]] (67,580,074 samples, 0.22%) + -[[kernel.kallsyms]] (44,111,847 samples, 0.08%) - +[[kernel.kallsyms]] (9,053,438 samples, 0.03%) + -[[kernel.kallsyms]] (43,636,433 samples, 0.08%) - +[[kernel.kallsyms]] (9,834,968 samples, 0.03%) + -numa_node_size64 (59,279,404 samples, 0.11%) - +[[kernel.kallsyms]] (3,679,421 samples, 0.01%) + -[[kernel.kallsyms]] (76,833,477 samples, 0.14%) - +[[kernel.kallsyms]] (22,969,731 samples, 0.08%) + -_mm512_stream_load_si512 (1,931,677,068 samples, 3.44%) -_mm.. +decltype (22,944,895 samples, 0.08%) + -[[stack]] (1,230,514,799 samples, 2.19%) -[.. +[[kernel.kallsyms]] (2,926,878 samples, 0.01%) + -[[kernel.kallsyms]] (10,494,303 samples, 0.02%) - +[[kernel.kallsyms]] (11,628,987 samples, 0.04%) + -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - +[[kernel.kallsyms]] (24,127,088 samples, 0.08%) + -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - +[[kernel.kallsyms]] (9,894,226 samples, 0.03%) + -[[kernel.kallsyms]] (24,405,965 samples, 0.04%) - +[[kernel.kallsyms]] (4,154,517 samples, 0.01%) + -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - +std::vector<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul>, std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> > >::resize (3,304,446 samples, 0.01%) + -[[kernel.kallsyms]] (9,512,265 samples, 0.02%) - +std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (52,594,123 samples, 0.17%) + -dsacache::Cache::Access (319,122,983 samples, 0.57%) - +_int_malloc (52,594,123 samples, 0.17%) + -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - +[[kernel.kallsyms]] (10,627,702 samples, 0.04%) + -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - +[[kernel.kallsyms]] (13,631,459 samples, 0.05%) + -[[kernel.kallsyms]] (78,924,962 samples, 0.14%) - +[dash] (3,194,438 samples, 0.01%) + -[[kernel.kallsyms]] (93,967,068 samples, 0.17%) - +_int_malloc (74,537,669 samples, 0.25%) + -[[kernel.kallsyms]] (22,459,290 samples, 0.04%) - +[[kernel.kallsyms]] (6,340,786 samples, 0.02%) + -[[kernel.kallsyms]] (32,804,268 samples, 0.06%) - +[[kernel.kallsyms]] (6,340,786 samples, 0.02%) + -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - +[[kernel.kallsyms]] (3,327,938 samples, 0.01%) + -[[kernel.kallsyms]] (12,108,425 samples, 0.02%) - +[[kernel.kallsyms]] (9,141,854 samples, 0.03%) + -[[kernel.kallsyms]] (256,323,849 samples, 0.46%) - +[[kernel.kallsyms]] (11,628,987 samples, 0.04%) + -std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (52,769,643 samples, 0.09%) - +__fopen_internal (10,442,407 samples, 0.03%) + -__GI___libc_read (6,155,254 samples, 0.01%) - +[[kernel.kallsyms]] (40,468,083 samples, 0.13%) + -operator new (46,717,102 samples, 0.08%) - +[[kernel.kallsyms]] (4,895,709 samples, 0.02%) + -dml::core::hardware_device::submit (25,560,478 samples, 0.05%) - +[[kernel.kallsyms]] (18,749,682 samples, 0.06%) + -_int_malloc (45,810,524 samples, 0.08%) - +[[kernel.kallsyms]] (3,895,940 samples, 0.01%) + -[[kernel.kallsyms]] (14,391,559 samples, 0.03%) - +[[kernel.kallsyms]] (12,953,538 samples, 0.04%) + -std::unordered_map<unsigned char*, dsacache::CacheData, std::hash<unsigned char*>, std::equal_to<unsigned char*>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> > >::clear (27,978,244,770 samples, 49.78%) -std::unordered_map<unsigned char*, dsacache::CacheData, std::hash<unsigned char*.. +dml::core::dispatcher::hw_dispatcher::initialize_hw (36,068,035 samples, 0.12%) + -dml::handler<dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >::allocator_type> dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (49,871,202 samples, 0.09%) - +advise_stack_range (26,010,449 samples, 0.09%) + -[[kernel.kallsyms]] (5,197,845 samples, 0.01%) - +std::allocator_traits<std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> > >::allocate (4,987,319 samples, 0.02%) + -[[kernel.kallsyms]] (15,189,233 samples, 0.03%) - +[[kernel.kallsyms]] (5,515,049 samples, 0.02%) + -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - +idxd_cdev_open (3,638,434 samples, 0.01%) + -__GI_munmap (56,894,061 samples, 0.10%) - +[[kernel.kallsyms]] (46,730,723 samples, 0.16%) + -[[kernel.kallsyms]] (13,011,844 samples, 0.02%) - +[[kernel.kallsyms]] (3,851,113 samples, 0.01%) + -[[kernel.kallsyms]] (9,512,265 samples, 0.02%) - +std::vector<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul>, std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> > >::resize (4,987,319 samples, 0.02%) + -operator new (35,828,488 samples, 0.06%) - +[[kernel.kallsyms]] (36,809,180 samples, 0.12%) + -[[kernel.kallsyms]] (91,643,182 samples, 0.16%) - +tcache_init (3,304,446 samples, 0.01%) + -std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (35,828,488 samples, 0.06%) - +[[kernel.kallsyms]] (3,327,938 samples, 0.01%) + -[[kernel.kallsyms]] (67,470,988 samples, 0.12%) - +[[kernel.kallsyms]] (102,946,189 samples, 0.34%) + -[[kernel.kallsyms]] (7,929,584 samples, 0.01%) - +dml::detail::ml::task<std::allocator<unsigned char> >::task (72,172,962 samples, 0.24%) + -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - +__GI___mmap64 (3,319,544 samples, 0.01%) + -[[kernel.kallsyms]] (7,344,388 samples, 0.01%) - +[[kernel.kallsyms]] (6,340,786 samples, 0.02%) + -[[kernel.kallsyms]] (6,785,217 samples, 0.01%) - +[[kernel.kallsyms]] (43,912,672 samples, 0.15%) + -[[kernel.kallsyms]] (8,650,216 samples, 0.02%) - +add_wq (24,094,741 samples, 0.08%) + -[[kernel.kallsyms]] (7,344,388 samples, 0.01%) - +[[kernel.kallsyms]] (43,912,672 samples, 0.15%) + -[[kernel.kallsyms]] (7,030,285 samples, 0.01%) - +[[kernel.kallsyms]] (26,286,797 samples, 0.09%) + -[[kernel.kallsyms]] (7,450,133 samples, 0.01%) - +[[kernel.kallsyms]] (15,934,374 samples, 0.05%) + -accfg_get_param_long (9,811,793 samples, 0.02%) - +_int_new_arena (4,987,319 samples, 0.02%) + -[[kernel.kallsyms]] (29,377,009 samples, 0.05%) - +[[kernel.kallsyms]] (6,196,217,361 samples, 20.56%) +[[kernel.kallsyms]] -[[kernel.kallsyms]] (19,162,146 samples, 0.03%) - +[[kernel.kallsyms]] (56,689,460 samples, 0.19%) + -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - +[[kernel.kallsyms]] (9,141,854 samples, 0.03%) + -[[kernel.kallsyms]] (11,362,923 samples, 0.02%) - +[[kernel.kallsyms]] (5,490,795 samples, 0.02%) + -_IO_new_file_underflow (6,564,990 samples, 0.01%) - +sysmalloc (52,594,123 samples, 0.17%) + -[[kernel.kallsyms]] (50,154,117 samples, 0.09%) - +[[kernel.kallsyms]] (31,245,532 samples, 0.10%) + -[[kernel.kallsyms]] (6,774,676 samples, 0.01%) - +[[kernel.kallsyms]] (48,607,741 samples, 0.16%) + -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - +[[kernel.kallsyms]] (37,433,031 samples, 0.12%) + -dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (52,769,643 samples, 0.09%) - +[[kernel.kallsyms]] (2,739,506 samples, 0.01%) + -[[kernel.kallsyms]] (25,315,610 samples, 0.05%) - +unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (999,184,562 samples, 3.32%) +uns.. -[[kernel.kallsyms]] (6,357,006 samples, 0.01%) - +__GI___mmap64 (48,281,164 samples, 0.16%) + -[[kernel.kallsyms]] (16,010,002 samples, 0.03%) - +_IO_new_file_underflow (15,317,698 samples, 0.05%) + -__libc_start_call_main (47,034,233,602 samples, 83.69%) -__libc_start_call_main +grow_heap (67,580,074 samples, 0.22%) + -dsacache::Cache::GetCacheNode (28,516,368 samples, 0.05%) - +[[kernel.kallsyms]] (56,689,460 samples, 0.19%) + -void fill_mt<unsigned long> (17,603,955,046 samples, 31.32%) -void fill_mt<unsigned long> +[[kernel.kallsyms]] (24,127,088 samples, 0.08%) + -auto dml::detail::submit<dml::hardware, dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >, dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (49,871,202 samples, 0.09%) - +[[kernel.kallsyms]] (2,980,686 samples, 0.01%) + -[[kernel.kallsyms]] (13,838,204 samples, 0.02%) - +[[kernel.kallsyms]] (5,886,956 samples, 0.02%) + -__GI___libc_read (15,623,013 samples, 0.03%) - +sum_check (1,364,784,253 samples, 4.53%) +sum_c.. -dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (46,717,102 samples, 0.08%) - +[[kernel.kallsyms]] (6,340,786 samples, 0.02%) + -[[kernel.kallsyms]] (26,878,702 samples, 0.05%) - +dsacache::Cache::SubmitTask (38,292,326 samples, 0.13%) + -[[kernel.kallsyms]] (26,878,702 samples, 0.05%) - +[[kernel.kallsyms]] (4,208,454 samples, 0.01%) + -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - +[[kernel.kallsyms]] (1,980,065,646 samples, 6.57%) +[[kernel.. -_IO_new_fclose (8,909,079 samples, 0.02%) - +[[kernel.kallsyms]] (8,507,469 samples, 0.03%) + -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - +[dash] (3,496,415 samples, 0.01%) + -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - +[[kernel.kallsyms]] (12,818,755 samples, 0.04%) + -[[kernel.kallsyms]] (80,579,994 samples, 0.14%) - +[[kernel.kallsyms]] (47,628,726 samples, 0.16%) + -[[kernel.kallsyms]] (155,318,192 samples, 0.28%) - +dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (108,946,296 samples, 0.36%) + -add_wq (15,236,807 samples, 0.03%) - +[[stack]] (1,222,902,290 samples, 4.06%) +[[st.. -openat (6,835,870 samples, 0.01%) - +tcache_init (4,987,319 samples, 0.02%) + -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - +[[kernel.kallsyms]] (11,628,987 samples, 0.04%) + -std::thread::_M_start_thread (12,724,581 samples, 0.02%) - +[[kernel.kallsyms]] (7,414,650 samples, 0.02%) + -sysmalloc (45,810,524 samples, 0.08%) - +dsacache::Cache::ExecuteCopy (76,042,767 samples, 0.25%) + -[[kernel.kallsyms]] (42,101,327 samples, 0.07%) - +[[kernel.kallsyms]] (26,010,449 samples, 0.09%) + -[[kernel.kallsyms]] (5,553,168 samples, 0.01%) - +[[kernel.kallsyms]] (102,946,189 samples, 0.34%) + -advise_stack_range (11,778,485 samples, 0.02%) - +[[kernel.kallsyms]] (4,895,709 samples, 0.02%) + -_int_memalign (46,717,102 samples, 0.08%) - +dml::core::hardware_device::submit (38,292,326 samples, 0.13%) + -[[kernel.kallsyms]] (88,186,671 samples, 0.16%) - +[[kernel.kallsyms]] (46,452,777 samples, 0.15%) + -dsacache::Cache::AllocOnNode (206,190,059 samples, 0.37%) - +[dash] (4,123,565 samples, 0.01%) + -syscall (52,197,849 samples, 0.09%) - +_int_malloc (108,543,048 samples, 0.36%) + -[[kernel.kallsyms]] (34,047,863 samples, 0.06%) - +sysmalloc (74,355,039 samples, 0.25%) + -_mm512_cmplt_epi64_mask (26,906,506 samples, 0.05%) - +grow_heap (69,459,330 samples, 0.23%) + -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - +std::__tree_barrier<NopStruct>::wait (4,934,427 samples, 0.02%) + -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - +scan_b (345,594,606 samples, 1.15%) + -[[kernel.kallsyms]] (5,869,681 samples, 0.01%) - +[[kernel.kallsyms]] (8,234,982 samples, 0.03%) + -[[kernel.kallsyms]] (7,323,204,071 samples, 13.03%) -[[kernel.kallsyms]] +[[kernel.kallsyms]] (3,393,787 samples, 0.01%) + -[[kernel.kallsyms]] (8,650,216 samples, 0.02%) - +[[kernel.kallsyms]] (3,087,033 samples, 0.01%) + -start_thread (6,580,772,960 samples, 11.71%) -start_thread +syscall (3,384,849 samples, 0.01%) + -dml::detail::ml::task<std::allocator<unsigned char> >::task (46,717,102 samples, 0.08%) - +[[kernel.kallsyms]] (19,296,338 samples, 0.06%) + -std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::_M_gen_rand (1,623,195,042 samples, 2.89%) -st.. +[[kernel.kallsyms]] (9,437,055 samples, 0.03%) + -std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (52,769,643 samples, 0.09%) - +[[kernel.kallsyms]] (3,899,192 samples, 0.01%) + -[[kernel.kallsyms]] (50,154,117 samples, 0.09%) - +[[kernel.kallsyms]] (5,707,016 samples, 0.02%) + -[[kernel.kallsyms]] (16,480,213 samples, 0.03%) - +[[kernel.kallsyms]] (7,549,011,630 samples, 25.05%) +[[kernel.kallsyms]] -mbind (52,197,849 samples, 0.09%) - +[[kernel.kallsyms]] (20,389,388 samples, 0.07%) + -[[kernel.kallsyms]] (10,757,457 samples, 0.02%) - +[[kernel.kallsyms]] (43,912,672 samples, 0.15%) + -[[kernel.kallsyms]] (5,997,398 samples, 0.01%) - +dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (72,172,962 samples, 0.24%) + -[[kernel.kallsyms]] (8,817,500 samples, 0.02%) - +[[kernel.kallsyms]] (18,726,527 samples, 0.06%) + -[[kernel.kallsyms]] (18,346,934 samples, 0.03%) - +[[kernel.kallsyms]] (6,205,810 samples, 0.02%) + -[[kernel.kallsyms]] (25,918,503 samples, 0.05%) - +[[kernel.kallsyms]] (8,278,132 samples, 0.03%) + -Sum<unsigned long>::simd_agg (414,838,602 samples, 0.74%) - +[[kernel.kallsyms]] (89,429,025 samples, 0.30%) + -[[kernel.kallsyms]] (34,047,863 samples, 0.06%) - +[[kernel.kallsyms]] (3,035,802 samples, 0.01%) + -__GI_munmap (80,579,994 samples, 0.14%) - +[[kernel.kallsyms]] (20,513,121 samples, 0.07%) + -[[kernel.kallsyms]] (92,507,760 samples, 0.16%) - +syscall (15,934,374 samples, 0.05%) + -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - +[[kernel.kallsyms]] (4,330,637 samples, 0.01%) + -[[kernel.kallsyms]] (5,002,702,867 samples, 8.90%) -[[kernel.kal.. +dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::handler (52,594,123 samples, 0.17%) + -[[kernel.kallsyms]] (32,408,072 samples, 0.06%) - +[[kernel.kallsyms]] (43,912,672 samples, 0.15%) + -[[kernel.kallsyms]] (9,123,981 samples, 0.02%) - +[[kernel.kallsyms]] (52,345,705 samples, 0.17%) + -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - +dsacache::CacheData::~CacheData (44,773,918 samples, 0.15%) + -__GI_munmap (6,540,584 samples, 0.01%) - +unsigned long std::uniform_int_distribution<unsigned long>::operator (1,222,040,291 samples, 4.05%) +unsi.. -_int_malloc (52,769,643 samples, 0.09%) - +[[kernel.kallsyms]] (5,028,875 samples, 0.02%) + -[[kernel.kallsyms]] (44,111,847 samples, 0.08%) - +[[kernel.kallsyms]] (9,834,968 samples, 0.03%) + -__GI___mmap64 (32,804,268 samples, 0.06%) - +operator new (76,042,767 samples, 0.25%) + -[[kernel.kallsyms]] (85,434,447 samples, 0.15%) - +[[kernel.kallsyms]] (45,902,157 samples, 0.15%) + -Vector_Loader<unsigned long, (2,993,062,044 samples, 5.33%) -Vector.. +dsacache::Cache::GetCacheNode (18,425,267 samples, 0.06%) + -[[kernel.kallsyms]] (7,785,866 samples, 0.01%) - +[[kernel.kallsyms]] (8,278,132 samples, 0.03%) + -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - +[[kernel.kallsyms]] (6,847,267 samples, 0.02%) + -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - +[[kernel.kallsyms]] (11,918,533 samples, 0.04%) + -grow_heap (44,111,847 samples, 0.08%) - +[[kernel.kallsyms]] (67,094,839 samples, 0.22%) + -_mid_memalign (35,828,488 samples, 0.06%) - +[[kernel.kallsyms]] (4,472,549 samples, 0.01%) + -_start (47,036,380,628 samples, 83.69%) -_start +auto dml::detail::submit<dml::hardware, dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >, dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (76,042,767 samples, 0.25%) + -[[kernel.kallsyms]] (25,918,503 samples, 0.05%) - +[[kernel.kallsyms]] (9,688,156 samples, 0.03%) + -dsacache::CacheData::Deallocate (92,507,760 samples, 0.16%) - +[[kernel.kallsyms]] (10,259,033 samples, 0.03%) + -[[kernel.kallsyms]] (19,162,146 samples, 0.03%) - +[[kernel.kallsyms]] (4,472,549 samples, 0.01%) + -[[kernel.kallsyms]] (8,551,434 samples, 0.02%) - +[[kernel.kallsyms]] (9,633,248 samples, 0.03%) + -[[kernel.kallsyms]] (5,543,739 samples, 0.01%) - +_mm512_stream_load_si512 (976,514,077 samples, 3.24%) +_mm.. -[[kernel.kallsyms]] (15,896,933 samples, 0.03%) - +main (19,307,949,140 samples, 64.06%) +main -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - +[[kernel.kallsyms]] (2,724,961 samples, 0.01%) + -_IO_new_file_fopen (17,566,875 samples, 0.03%) - +[[kernel.kallsyms]] (6,653,708 samples, 0.02%) + -__fopen_internal (17,566,875 samples, 0.03%) - +[[kernel.kallsyms]] (45,523,836 samples, 0.15%) + -unsigned long std::uniform_int_distribution<unsigned long>::operator (1,226,127,583 samples, 2.18%) -u.. +[[kernel.kallsyms]] (15,934,374 samples, 0.05%) + -grow_heap (34,047,863 samples, 0.06%) - +tcache_init (3,304,446 samples, 0.01%) + -[[kernel.kallsyms]] (10,376,330 samples, 0.02%) - +__GI___libc_read (9,834,968 samples, 0.03%) + -[[kernel.kallsyms]] (35,593,800 samples, 0.06%) - +[[kernel.kallsyms]] (24,127,088 samples, 0.08%) + -std::thread::thread<void (12,724,581 samples, 0.02%) - +[[kernel.kallsyms]] (24,201,250 samples, 0.08%) + -[[kernel.kallsyms]] (41,592,934 samples, 0.07%) - +[[kernel.kallsyms]] (43,290,481 samples, 0.14%) + -dsacache::Cache::Access (199,605,584 samples, 0.36%) - +__GI___mmap64 (3,319,544 samples, 0.01%) + -[[kernel.kallsyms]] (8,630,962 samples, 0.02%) - +_mid_memalign (108,946,296 samples, 0.36%) + -[[kernel.kallsyms]] (25,315,610 samples, 0.05%) - +[[kernel.kallsyms]] (9,141,854 samples, 0.03%) + -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - +[[kernel.kallsyms]] (65,884,039 samples, 0.22%) + -[[kernel.kallsyms]] (10,757,457 samples, 0.02%) - +[dash] (4,123,565 samples, 0.01%) + -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - +queue_stack (10,627,702 samples, 0.04%) + -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - +[[kernel.kallsyms]] (7,465,849 samples, 0.02%) + -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - +[[kernel.kallsyms]] (20,389,388 samples, 0.07%) + -_mid_memalign (10,813,731 samples, 0.02%) - +__GI___close_nocancel (2,810,815 samples, 0.01%) + -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - +[[kernel.kallsyms]] (23,008,980 samples, 0.08%) + -dsacache::CacheData::WaitOnCompletion (27,883,144,753 samples, 49.61%) -dsacache::CacheData::WaitOnCompletion +[[kernel.kallsyms]] (8,234,982 samples, 0.03%) + -[[kernel.kallsyms]] (5,109,987 samples, 0.01%) - +QDPBench (30,111,044,709 samples, 99.90%) +QDPBench -[[kernel.kallsyms]] (17,265,048 samples, 0.03%) - +[[kernel.kallsyms]] (2,965,387 samples, 0.01%) + -dml::detail::ml::task<std::allocator<unsigned char> >::task (35,828,488 samples, 0.06%) - +[[kernel.kallsyms]] (16,782,290 samples, 0.06%) + -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - +[[kernel.kallsyms]] (2,909,339 samples, 0.01%) + -__GI_munmap (92,507,760 samples, 0.16%) - +[[kernel.kallsyms]] (3,714,704 samples, 0.01%) + -[[kernel.kallsyms]] (93,967,068 samples, 0.17%) - +__GI___libc_read (8,234,982 samples, 0.03%) + -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - +void std::__detail::__platform_wait<int> (3,384,849 samples, 0.01%) + -[[kernel.kallsyms]] (9,919,156 samples, 0.02%) - +[[kernel.kallsyms]] (4,895,709 samples, 0.02%) + -[[kernel.kallsyms]] (25,315,610 samples, 0.05%) - +numa_node_size64 (27,653,358 samples, 0.09%) + -[[kernel.kallsyms]] (5,997,398 samples, 0.01%) - +_IO_new_file_close_it (3,343,282 samples, 0.01%) + -[[kernel.kallsyms]] (8,020,276 samples, 0.01%) - +[[kernel.kallsyms]] (4,895,709 samples, 0.02%) + -[[kernel.kallsyms]] (10,757,457 samples, 0.02%) - +[[kernel.kallsyms]] (25,526,558 samples, 0.08%) + -__GI___close_nocancel (7,344,388 samples, 0.01%) - +[[kernel.kallsyms]] (7,548,151,192 samples, 25.04%) +[[kernel.kallsyms]] -QDPBench (56,173,802,251 samples, 99.95%) -QDPBench +[[kernel.kallsyms]] (4,678,190 samples, 0.02%) + -dml::core::dispatcher::hw_device::initialize_new_device (21,127,100 samples, 0.04%) - +[[kernel.kallsyms]] (20,389,388 samples, 0.07%) + -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - +void std::destroy_at<std::pair<unsigned char* const, dsacache::CacheData> > (44,773,918 samples, 0.15%) + -operator new (10,813,731 samples, 0.02%) - +dml::core::dispatcher::hw_queue::initialize_new_queue (11,142,496 samples, 0.04%) + -unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (5,144,892,302 samples, 9.15%) -unsigned int .. +[[kernel.kallsyms]] (3,638,434 samples, 0.01%) + -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - +auto dml::detail::ml::make_mem_move_task<std::allocator<unsigned char> > (76,042,767 samples, 0.25%) + -[[kernel.kallsyms]] (8,400,747 samples, 0.01%) - +[[kernel.kallsyms]] (3,311,014 samples, 0.01%) + -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - +[[kernel.kallsyms]] (43,912,672 samples, 0.15%) + -[unknown] (1,234,157,360 samples, 2.20%) -[.. +dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::handler (72,843,903 samples, 0.24%) + -[[kernel.kallsyms]] (6,357,006 samples, 0.01%) - +_IO_new_file_close_it (2,810,815 samples, 0.01%) + -dml::detail::ml::impl::hardware::submit (25,560,478 samples, 0.05%) - +[[kernel.kallsyms]] (67,580,074 samples, 0.22%) + -[[kernel.kallsyms]] (7,324,068,091 samples, 13.03%) -[[kernel.kallsyms]] +[[kernel.kallsyms]] (56,689,460 samples, 0.19%) + -_IO_new_file_close_it (7,344,388 samples, 0.01%) - +[[kernel.kallsyms]] (6,340,786 samples, 0.02%) + -LT<unsigned long>::simd_filter (26,906,506 samples, 0.05%) - +[[kernel.kallsyms]] (4,330,637 samples, 0.01%) + -unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (1,015,021,080 samples, 1.81%) -u.. +[[kernel.kallsyms]] (20,165,201 samples, 0.07%) + -__GI___mmap64 (93,967,068 samples, 0.17%) - +[[kernel.kallsyms]] (3,052,559 samples, 0.01%) + -[[kernel.kallsyms]] (8,650,216 samples, 0.02%) - +[[kernel.kallsyms]] (8,278,378 samples, 0.03%) + -[[kernel.kallsyms]] (258,139,562 samples, 0.46%) - +[[kernel.kallsyms]] (11,628,987 samples, 0.04%) + -__libc_openat64 (6,835,870 samples, 0.01%) - +[[kernel.kallsyms]] (3,052,559 samples, 0.01%) + -queue_stack (6,540,584 samples, 0.01%) - +_IO_new_file_fopen (8,519,108 samples, 0.03%) + -[[kernel.kallsyms]] (11,040,629 samples, 0.02%) - +[[kernel.kallsyms]] (22,241,544 samples, 0.07%) + -[[kernel.kallsyms]] (183,101,380 samples, 0.33%) - +[[kernel.kallsyms]] (7,542,907,663 samples, 25.03%) +[[kernel.kallsyms]] -__libc_open64 (17,566,875 samples, 0.03%) - +aggr_j (1,889,465,270 samples, 6.27%) +aggr_j -[[kernel.kallsyms]] (26,878,702 samples, 0.05%) - +[[kernel.kallsyms]] (3,327,938 samples, 0.01%) + -[[kernel.kallsyms]] (5,302,335 samples, 0.01%) - +[[kernel.kallsyms]] (8,396,004 samples, 0.03%) + -[[kernel.kallsyms]] (11,866,122 samples, 0.02%) - +[[kernel.kallsyms]] (9,437,055 samples, 0.03%) + -std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (46,717,102 samples, 0.08%) - +__GI_madvise (26,010,449 samples, 0.09%) + -_mm512_mask_add_epi64 (414,838,602 samples, 0.74%) - +[[kernel.kallsyms]] (15,929,571 samples, 0.05%) + -std::pair<unsigned char* const, dsacache::CacheData>::~pair (27,978,244,770 samples, 49.78%) -std::pair<unsigned char* const, dsacache::CacheData>::~pair +[[kernel.kallsyms]] (7,538,487 samples, 0.03%) + -[[kernel.kallsyms]] (33,581,259 samples, 0.06%) - +[[kernel.kallsyms]] (53,257,964 samples, 0.18%) + -[[kernel.kallsyms]] (5,360,812 samples, 0.01%) - +[[kernel.kallsyms]] (6,885,828 samples, 0.02%) + -[[kernel.kallsyms]] (32,112,996 samples, 0.06%) - +accfg_get_param_str (6,647,561 samples, 0.02%) + -[[kernel.kallsyms]] (16,791,097 samples, 0.03%) - +openat (11,628,987 samples, 0.04%) + -[[kernel.kallsyms]] (6,799,118 samples, 0.01%) - +unsigned long std::uniform_int_distribution<unsigned long>::operator (1,247,495,041 samples, 4.14%) +unsi.. -sudo (16,406,659 samples, 0.03%) - +[[kernel.kallsyms]] (3,318,867 samples, 0.01%) + -std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::_M_deallocate_node (27,978,244,770 samples, 49.78%) -std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pa.. +[[kernel.kallsyms]] (4,208,454 samples, 0.01%) + -wqs_init (16,212,211 samples, 0.03%) - +__libc_open64 (10,442,407 samples, 0.03%) + -dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (35,828,488 samples, 0.06%) - +[[kernel.kallsyms]] (47,812,091 samples, 0.16%) + -[[kernel.kallsyms]] (25,918,503 samples, 0.05%) - +[[kernel.kallsyms]] (37,531,108 samples, 0.12%) + -[[kernel.kallsyms]] (38,027,005 samples, 0.07%) - +[[kernel.kallsyms]] (89,429,025 samples, 0.30%) + -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - +[[kernel.kallsyms]] (33,160,854 samples, 0.11%) + -_mid_memalign (52,769,643 samples, 0.09%) - +[[kernel.kallsyms]] (3,699,146 samples, 0.01%) + -[[kernel.kallsyms]] (16,791,097 samples, 0.03%) - +std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (108,946,296 samples, 0.36%) + -operator new (52,769,643 samples, 0.09%) - +[[kernel.kallsyms]] (2,963,362 samples, 0.01%) + -std::_Hashtable<unsigned char*, std::pair<unsigned char* const, dsacache::CacheData>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> >, std::__detail::_Select1st, std::equal_to<unsigned char*>, std::hash<unsigned char*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::clear (27,978,244,770 samples, 49.78%) -std::_Hashtable<unsigned char*, std::pair<unsigned char* const, dsacache::CacheD.. +[[kernel.kallsyms]] (25,526,558 samples, 0.08%) + -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - +[[kernel.kallsyms]] (6,791,218 samples, 0.02%) + -dsacache::CacheData::WaitOnCompletion (84,126,226 samples, 0.15%) - +__GI_mprotect (47,572,366 samples, 0.16%) + -dsacache::Cache::GetCacheNode (23,082,624 samples, 0.04%) - +[[kernel.kallsyms]] (46,730,723 samples, 0.16%) + -[[kernel.kallsyms]] (93,216,145 samples, 0.17%) - +__GI___libc_read (9,834,968 samples, 0.03%) + -dsacache::Cache::SubmitTask (25,560,478 samples, 0.05%) - +__GI___getdelim (12,180,826 samples, 0.04%) + -void std::destroy_at<std::pair<unsigned char* const, dsacache::CacheData> > (27,978,244,770 samples, 49.78%) -void std::destroy_at<std::pair<unsigned char* const, dsacache::CacheData> > +[[kernel.kallsyms]] (9,437,055 samples, 0.03%) + -__GI___libc_read (15,623,013 samples, 0.03%) - +[[kernel.kallsyms]] (68,850,041 samples, 0.23%) + -[[kernel.kallsyms]] (89,913,865 samples, 0.16%) - +[[kernel.kallsyms]] (5,691,293 samples, 0.02%) + -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - +[[kernel.kallsyms]] (4,208,454 samples, 0.01%) + -Vector_Loader<unsigned long, (1,931,677,068 samples, 3.44%) -Vec.. +[dash] (4,123,565 samples, 0.01%) + -[[kernel.kallsyms]] (11,089,814 samples, 0.02%) - +std::barrier<NopStruct>::arrive_and_wait (4,934,427 samples, 0.02%) + -__GI_madvise (11,778,485 samples, 0.02%) - +[[kernel.kallsyms]] (3,343,282 samples, 0.01%) + -syscall (26,878,702 samples, 0.05%) - +[[kernel.kallsyms]] (66,115,525 samples, 0.22%) + -[[kernel.kallsyms]] (16,071,115 samples, 0.03%) - +std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::_M_gen_rand (1,619,966,571 samples, 5.37%) +std::m.. -[[kernel.kallsyms]] (19,961,046 samples, 0.04%) - +[[kernel.kallsyms]] (10,627,702 samples, 0.04%) + -_mid_memalign (46,717,102 samples, 0.08%) - +__GI___libc_malloc (3,304,446 samples, 0.01%) + -syscall (25,315,610 samples, 0.05%) - +[[kernel.kallsyms]] (56,689,460 samples, 0.19%) + -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - +std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (52,594,123 samples, 0.17%) + -[[kernel.kallsyms]] (19,107,386 samples, 0.03%) - +__libc_openat64 (11,628,987 samples, 0.04%) + -[[kernel.kallsyms]] (13,838,204 samples, 0.02%) - +[[kernel.kallsyms]] (3,320,393 samples, 0.01%) + -decltype (12,724,581 samples, 0.02%) - +void std::allocator_traits<std::allocator<std::thread> >::construct<std::thread, void (22,944,895 samples, 0.08%) + -std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (10,813,731 samples, 0.02%) - +[[kernel.kallsyms]] (24,127,088 samples, 0.08%) + -[[kernel.kallsyms]] (13,514,538 samples, 0.02%) - +[[kernel.kallsyms]] (45,889,320 samples, 0.15%) + -sum_check (1,371,232,114 samples, 2.44%) -su.. +[[kernel.kallsyms]] (7,465,849 samples, 0.02%) + -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - +[[kernel.kallsyms]] (11,628,987 samples, 0.04%) + -__GI___getdelim (22,226,508 samples, 0.04%) - +[[kernel.kallsyms]] (6,205,810 samples, 0.02%) + -[[kernel.kallsyms]] (5,197,845 samples, 0.01%) - +[[kernel.kallsyms]] (9,894,226 samples, 0.03%) + -[[kernel.kallsyms]] (27,648,058 samples, 0.05%) - +std::__new_allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> >::allocate (3,304,446 samples, 0.01%) + -sysmalloc (10,813,731 samples, 0.02%) - +_int_memalign (75,333,655 samples, 0.25%) + -[[kernel.kallsyms]] (52,197,849 samples, 0.09%) - +operator new (108,946,296 samples, 0.36%) + -[[kernel.kallsyms]] (5,740,414 samples, 0.01%) - +arena_get2 (4,987,319 samples, 0.02%) + -unsigned long std::uniform_int_distribution<unsigned long>::operator (7,679,240,842 samples, 13.66%) -unsigned long std::u.. +[[kernel.kallsyms]] (68,850,041 samples, 0.23%) + -dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (10,813,731 samples, 0.02%) - +[[kernel.kallsyms]] (6,340,786 samples, 0.02%) + -[[kernel.kallsyms]] (25,430,139 samples, 0.05%) - +[[kernel.kallsyms]] (4,208,454 samples, 0.01%) + -std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (10,813,731 samples, 0.02%) - +[[kernel.kallsyms]] (7,850,046 samples, 0.03%) + -[[kernel.kallsyms]] (6,347,448 samples, 0.01%) - +__GI___mmap64 (48,281,164 samples, 0.16%) + -clone3 (6,606,257,445 samples, 11.75%) -clone3 +[[kernel.kallsyms]] (25,322,052 samples, 0.08%) + -[[kernel.kallsyms]] (26,878,702 samples, 0.05%) - +__sysfs_read_attr (4,992,336 samples, 0.02%) + -__pthread_create_2_1 (12,724,581 samples, 0.02%) - +[[kernel.kallsyms]] (56,689,460 samples, 0.19%) + -[[kernel.kallsyms]] (20,727,544 samples, 0.04%) - +[dash] (4,123,565 samples, 0.01%) + -dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::handler (35,828,488 samples, 0.06%) - +[[kernel.kallsyms]] (3,899,192 samples, 0.01%) + -[[kernel.kallsyms]] (30,153,610 samples, 0.05%) - +[[kernel.kallsyms]] (4,895,709 samples, 0.02%) + -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - +std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::_M_deallocate_node (44,773,918 samples, 0.15%) + -[[kernel.kallsyms]] (93,967,068 samples, 0.17%) - +[[kernel.kallsyms]] (7,414,650 samples, 0.02%) + -auto dml::detail::ml::make_mem_move_task<std::allocator<unsigned char> > (46,717,102 samples, 0.08%) - +[[kernel.kallsyms]] (37,463,632 samples, 0.12%) + -dml::core::dispatcher::hw_dispatcher::get_instance (25,315,981 samples, 0.05%) - +[[kernel.kallsyms]] (89,429,025 samples, 0.30%) + -[[kernel.kallsyms]] (7,344,388 samples, 0.01%) - +[[kernel.kallsyms]] (20,389,388 samples, 0.07%) + -__GI_mprotect (34,047,863 samples, 0.06%) - +mbind (37,463,632 samples, 0.12%) + -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - +[[kernel.kallsyms]] (22,098,821 samples, 0.07%) + -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - +[[kernel.kallsyms]] (6,961,023 samples, 0.02%) + -__GI_mprotect (44,111,847 samples, 0.08%) - +[[kernel.kallsyms]] (1,066,528,744 samples, 3.54%) +[[k.. -numa_node_size64 (17,944,285 samples, 0.03%) - +[[kernel.kallsyms]] (7,414,650 samples, 0.02%) + -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - +[libstdc++.so.6.0.32] (8,189,417,539 samples, 27.17%) +[libstdc++.so.6.0.32] -[[kernel.kallsyms]] (10,757,457 samples, 0.02%) - +__GI__IO_doallocbuf (5,482,730 samples, 0.02%) + -scan_a (3,462,002,037 samples, 6.16%) -scan_a +[[kernel.kallsyms]] (44,996,270 samples, 0.15%) + -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - +[[kernel.kallsyms]] (9,141,854 samples, 0.03%) + -[[kernel.kallsyms]] (47,784,777 samples, 0.09%) - +sh (6,316,890 samples, 0.02%) + -[[kernel.kallsyms]] (11,243,448 samples, 0.02%) - +arch_fork (3,899,192 samples, 0.01%) + -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - +[[kernel.kallsyms]] (56,689,460 samples, 0.19%) + -std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (35,828,488 samples, 0.06%) - +grow_heap (47,572,366 samples, 0.16%) + -[[kernel.kallsyms]] (9,861,578 samples, 0.02%) - +std::vector<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul>, std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> > >::_M_default_append (4,987,319 samples, 0.02%) + -dsacache::Cache::ExecuteCopy (49,871,202 samples, 0.09%) - +[[kernel.kallsyms]] (43,912,672 samples, 0.15%) + -grow_heap (50,154,117 samples, 0.09%) - +std::__new_allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> >::allocate (4,987,319 samples, 0.02%) + -[[kernel.kallsyms]] (24,140,143 samples, 0.04%) - +date (2,801,126 samples, 0.01%) + -[[kernel.kallsyms]] (8,647,063 samples, 0.02%) - +[[kernel.kallsyms]] (9,014,446 samples, 0.03%) + -[[kernel.kallsyms]] (7,318,881,921 samples, 13.02%) -[[kernel.kallsyms]] +[[kernel.kallsyms]] (24,185,381 samples, 0.08%) + -[[kernel.kallsyms]] (6,540,584 samples, 0.01%) - +[[kernel.kallsyms]] (4,552,570 samples, 0.02%) + -std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (4,015,533,329 samples, 7.14%) -std::mers.. +[[kernel.kallsyms]] (3,837,971 samples, 0.01%) + -[[kernel.kallsyms]] (10,376,330 samples, 0.02%) - +_int_memalign (52,594,123 samples, 0.17%) + -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - +sysmalloc (72,172,962 samples, 0.24%) + -[[kernel.kallsyms]] (9,861,578 samples, 0.02%) - +auto dml::detail::submit<dml::hardware, dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >, dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (147,238,622 samples, 0.49%) + -[libstdc++.so.6.0.32] (6,568,910,875 samples, 11.69%) -[libstdc++.so.6.0.. +Vector_Loader<unsigned long, (976,514,077 samples, 3.24%) +Vec.. -[[kernel.kallsyms]] (5,197,845 samples, 0.01%) - +__GI_munmap (10,627,702 samples, 0.04%) + -[[kernel.kallsyms]] (80,579,994 samples, 0.14%) - +[[kernel.kallsyms]] (20,389,388 samples, 0.07%) + -[[kernel.kallsyms]] (11,778,485 samples, 0.02%) - +[unknown] (1,249,321,349 samples, 4.14%) +[unk.. -numa_alloc_onnode (59,682,970 samples, 0.11%) - +[[kernel.kallsyms]] (47,572,366 samples, 0.16%) + -[[kernel.kallsyms]] (45,789,329 samples, 0.08%) - +[[kernel.kallsyms]] (9,237,862 samples, 0.03%) + -[[kernel.kallsyms]] (50,154,117 samples, 0.09%) - +[[kernel.kallsyms]] (14,437,602 samples, 0.05%) + -dml::core::dispatcher::hw_dispatcher::hw_dispatcher (23,965,836 samples, 0.04%) - +[[kernel.kallsyms]] (9,894,226 samples, 0.03%) + -[[kernel.kallsyms]] (52,197,849 samples, 0.09%) - +[[kernel.kallsyms]] (7,457,397,998 samples, 24.74%) +[[kernel.kallsyms]] -_int_memalign (10,813,731 samples, 0.02%) - +[[kernel.kallsyms]] (3,318,867 samples, 0.01%) + -__sysfs_device_parse (16,212,211 samples, 0.03%) - +[[kernel.kallsyms]] (3,899,192 samples, 0.01%) + -[[kernel.kallsyms]] (92,507,760 samples, 0.16%) - +grow_heap (102,946,189 samples, 0.34%) + -[[kernel.kallsyms]] (258,083,965 samples, 0.46%) - +[[kernel.kallsyms]] (37,433,031 samples, 0.12%) + -[[kernel.kallsyms]] (5,333,154 samples, 0.01%) - +[[kernel.kallsyms]] (37,433,031 samples, 0.12%) + -[[kernel.kallsyms]] (5,542,336 samples, 0.01%) - +[[kernel.kallsyms]] (9,894,226 samples, 0.03%) + -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - +sysmalloc (107,684,660 samples, 0.36%) + -[[kernel.kallsyms]] (7,240,609,876 samples, 12.88%) -[[kernel.kallsyms]] +dml::detail::ml::impl::hardware::submit (38,292,326 samples, 0.13%) + -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - +dml::core::dispatcher::hw_device::initialize_new_device (36,068,035 samples, 0.12%) + -dml::handler<dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >::allocator_type> dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (78,330,121 samples, 0.14%) - +std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (76,042,767 samples, 0.25%) + -std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (52,769,643 samples, 0.09%) - +std::_Hashtable<unsigned char*, std::pair<unsigned char* const, dsacache::CacheData>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> >, std::__detail::_Select1st, std::equal_to<unsigned char*>, std::hash<unsigned char*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::clear (45,636,194 samples, 0.15%) + -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - +[[kernel.kallsyms]] (4,546,338 samples, 0.02%) + -[[kernel.kallsyms]] (80,579,994 samples, 0.14%) - +__GI__IO_doallocbuf (5,482,730 samples, 0.02%) + -[[kernel.kallsyms]] (13,726,772 samples, 0.02%) - +[dash] (3,496,415 samples, 0.01%) + -[[kernel.kallsyms]] (4,860,205 samples, 0.01%) - +[[kernel.kallsyms]] (3,899,192 samples, 0.01%) + -[[kernel.kallsyms]] (6,774,676 samples, 0.01%) - +[[kernel.kallsyms]] (6,340,786 samples, 0.02%) + -[[kernel.kallsyms]] (33,198,896 samples, 0.06%) - +void std::__detail::__waiter<std::integral_constant<bool, true> >::_M_do_wait<std::__tree_barrier<NopStruct>::wait (4,199,137 samples, 0.01%) + -[[kernel.kallsyms]] (10,757,457 samples, 0.02%) - +[[kernel.kallsyms]] (3,035,802 samples, 0.01%) + -[[kernel.kallsyms]] (31,728,822 samples, 0.06%) - +[[kernel.kallsyms]] (3,884,301 samples, 0.01%) + -[[kernel.kallsyms]] (15,623,013 samples, 0.03%) - +[[kernel.kallsyms]] (34,361,612 samples, 0.11%) + -[[kernel.kallsyms]] (82,999,130 samples, 0.15%) - +[[kernel.kallsyms]] (8,234,982 samples, 0.03%) + -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - +[[kernel.kallsyms]] (51,193,258 samples, 0.17%) + -[[kernel.kallsyms]] (4,837,589 samples, 0.01%) - +dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (108,946,296 samples, 0.36%) + -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - +[[kernel.kallsyms]] (4,330,637 samples, 0.01%) + -[[kernel.kallsyms]] (25,484,485 samples, 0.05%) - +[[kernel.kallsyms]] (28,944,481 samples, 0.10%) + -dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::handler (10,813,731 samples, 0.02%) - +__GI_mprotect (102,946,189 samples, 0.34%) + -[[kernel.kallsyms]] (52,197,849 samples, 0.09%) - +[[kernel.kallsyms]] (67,580,074 samples, 0.22%) + -__GI___getdelim (7,608,450 samples, 0.01%) - +std::_Vector_base<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul>, std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> > >::_M_allocate (3,304,446 samples, 0.01%) + -dsacache::Cache::AllocOnNode (77,976,052 samples, 0.14%) - +__GI_munmap (56,689,460 samples, 0.19%) + -dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::get (27,883,144,753 samples, 49.61%) -dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::get +[[kernel.kallsyms]] (7,743,611 samples, 0.03%) + -[[kernel.kallsyms]] (13,547,223 samples, 0.02%) - +tcache_init (4,987,319 samples, 0.02%) + -[[kernel.kallsyms]] (256,851,116 samples, 0.46%) - +__libc_open64 (8,519,108 samples, 0.03%) + -[[kernel.kallsyms]] (89,913,865 samples, 0.16%) - +_mid_memalign (52,594,123 samples, 0.17%) + -void std::allocator_traits<std::allocator<std::thread> >::construct<std::thread, void (12,724,581 samples, 0.02%) - +dml::detail::ml::task<std::allocator<unsigned char> >::task (76,042,767 samples, 0.25%) + -std::thread::join (6,839,397 samples, 0.01%) - +[[kernel.kallsyms]] (20,389,388 samples, 0.07%) + -[[kernel.kallsyms]] (19,162,146 samples, 0.03%) - +[[kernel.kallsyms]] (3,384,849 samples, 0.01%) + -[[kernel.kallsyms]] (15,623,013 samples, 0.03%) - +[[kernel.kallsyms]] (25,862,590 samples, 0.09%) + -__pthread_clockjoin_ex (6,839,397 samples, 0.01%) - +[[kernel.kallsyms]] (24,757,757 samples, 0.08%) + -_mm512_stream_load_si512 (2,993,062,044 samples, 5.33%) -_mm512.. +_int_malloc (72,172,962 samples, 0.24%) + -[[kernel.kallsyms]] (12,973,510 samples, 0.02%) - +std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (76,042,767 samples, 0.25%) + -dml::detail::ml::task<std::allocator<unsigned char> >::task (52,769,643 samples, 0.09%) - +[[kernel.kallsyms]] (4,546,338 samples, 0.02%) + -[[kernel.kallsyms]] (90,779,509 samples, 0.16%) - +[[kernel.kallsyms]] (9,834,968 samples, 0.03%) + -[[kernel.kallsyms]] (14,066,357 samples, 0.03%) - +[[kernel.kallsyms]] (1,953,796,617 samples, 6.48%) +[[kernel.. -[[kernel.kallsyms]] (15,623,013 samples, 0.03%) - +[[kernel.kallsyms]] (7,684,763 samples, 0.03%) + -[[kernel.kallsyms]] (25,315,610 samples, 0.05%) - +[[kernel.kallsyms]] (4,520,112 samples, 0.01%) + -[[kernel.kallsyms]] (11,870,294 samples, 0.02%) - +dsacache::Cache::Access (275,442,985 samples, 0.91%) + -sh (5,618,491 samples, 0.01%) - +[[kernel.kallsyms]] (37,463,632 samples, 0.12%) + -[[kernel.kallsyms]] (6,000,961,265 samples, 10.68%) -[[kernel.kallsy.. +[[kernel.kallsyms]] (43,912,672 samples, 0.15%) + -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - +__GI___mmap64 (48,607,741 samples, 0.16%) + -_IO_new_file_underflow (19,746,317 samples, 0.04%) - +[[kernel.kallsyms]] (48,607,741 samples, 0.16%) + -[[kernel.kallsyms]] (15,896,933 samples, 0.03%) - +[[kernel.kallsyms]] (4,472,549 samples, 0.01%) + -[[kernel.kallsyms]] (43,636,433 samples, 0.08%) - +_int_memalign (108,543,048 samples, 0.36%) + -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - +[[kernel.kallsyms]] (46,730,723 samples, 0.16%) + -dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (46,717,102 samples, 0.08%) - +__GI__IO_file_open (10,442,407 samples, 0.03%) + -[[kernel.kallsyms]] (52,197,849 samples, 0.09%) - +operator new (72,172,962 samples, 0.24%) + -[[kernel.kallsyms]] (23,722,633 samples, 0.04%) - +[[kernel.kallsyms]] (1,114,937,523 samples, 3.70%) +[[ke.. -[[kernel.kallsyms]] (8,649,622 samples, 0.02%) - +[[kernel.kallsyms]] (4,895,709 samples, 0.02%) + -all (56,200,887,845 samples, 100%) - +[[kernel.kallsyms]] (3,733,096 samples, 0.01%) + + + +dsacache::Cache::Clear (45,636,194 samples, 0.15%) + + + +_IO_new_fclose (2,810,815 samples, 0.01%) + + + +std::_Hashtable<unsigned char*, std::pair<unsigned char* const, dsacache::CacheData>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> >, std::__detail::_Select1st, std::equal_to<unsigned char*>, std::hash<unsigned char*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::find (2,594,606 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,847,267 samples, 0.02%) + + + +[[kernel.kallsyms]] (47,619,043 samples, 0.16%) + + + +[[kernel.kallsyms]] (7,544,275,261 samples, 25.03%) +[[kernel.kallsyms]] + + +[[kernel.kallsyms]] (7,414,650 samples, 0.02%) + + + +syscall (12,818,755 samples, 0.04%) + + + +[[kernel.kallsyms]] (4,472,549 samples, 0.01%) + + + +[[kernel.kallsyms]] (19,152,020 samples, 0.06%) + + + +[[kernel.kallsyms]] (9,437,055 samples, 0.03%) + + + +[[kernel.kallsyms]] (22,481,351 samples, 0.07%) + + + +__libc_open64 (3,327,938 samples, 0.01%) + + + +auto dml::detail::ml::make_mem_move_task<std::allocator<unsigned char> > (108,946,296 samples, 0.36%) + + + +std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (3,985,773,817 samples, 13.22%) +std::mersenne_twiste.. + + +[[kernel.kallsyms]] (66,900,060 samples, 0.22%) + + + +[[kernel.kallsyms]] (2,913,068 samples, 0.01%) + + + +[[kernel.kallsyms]] (4,330,637 samples, 0.01%) + + + +[[kernel.kallsyms]] (37,463,632 samples, 0.12%) + + + +[[kernel.kallsyms]] (37,463,632 samples, 0.12%) + + + +[[kernel.kallsyms]] (3,899,192 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,340,786 samples, 0.02%) + + + +[[kernel.kallsyms]] (102,946,189 samples, 0.34%) + + + +[[kernel.kallsyms]] (7,079,005 samples, 0.02%) + + + +[[kernel.kallsyms]] (4,208,454 samples, 0.01%) + + + +[[kernel.kallsyms]] (2,739,506 samples, 0.01%) + + + +[[kernel.kallsyms]] (46,383,137 samples, 0.15%) + + + +start_thread (8,220,408,017 samples, 27.27%) +start_thread + + +dml::core::dispatcher::hw_dispatcher::hw_dispatcher (36,068,035 samples, 0.12%) + + + +std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (108,946,296 samples, 0.36%) + + + +[[kernel.kallsyms]] (3,733,096 samples, 0.01%) + + + +[[kernel.kallsyms]] (1,972,678,394 samples, 6.54%) +[[kernel.. + + +[[kernel.kallsyms]] (9,141,854 samples, 0.03%) + + + +[[kernel.kallsyms]] (37,477,240 samples, 0.12%) + + + +std::thread& std::vector<std::thread, std::allocator<std::thread> >::emplace_back<void (24,662,729 samples, 0.08%) + + + +[[kernel.kallsyms]] (9,141,854 samples, 0.03%) + + + +[[kernel.kallsyms]] (6,848,234 samples, 0.02%) + + + +[[kernel.kallsyms]] (8,485,010 samples, 0.03%) + + + +std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (72,172,962 samples, 0.24%) + + + +[[kernel.kallsyms]] (2,621,394 samples, 0.01%) + + + +[[kernel.kallsyms]] (56,125,463 samples, 0.19%) + + + +__libc_start_main_impl (4,123,565 samples, 0.01%) + + + +[[kernel.kallsyms]] (9,141,474 samples, 0.03%) + + + +__GI___getdelim (16,992,392 samples, 0.06%) + + + +sudo (17,914,694 samples, 0.06%) + + + +[[kernel.kallsyms]] (3,847,851 samples, 0.01%) + + + +[[kernel.kallsyms]] (39,469,767 samples, 0.13%) + + + +[[kernel.kallsyms]] (10,627,702 samples, 0.04%) + + + +[[kernel.kallsyms]] (67,574,391 samples, 0.22%) + + + +__sysfs_device_parse (24,925,539 samples, 0.08%) + + + +[[kernel.kallsyms]] (8,234,982 samples, 0.03%) + + + +[[kernel.kallsyms]] (36,849,356 samples, 0.12%) + + + +[[kernel.kallsyms]] (7,414,650 samples, 0.02%) + + + +[[kernel.kallsyms]] (6,847,267 samples, 0.02%) + + + +[[kernel.kallsyms]] (3,384,849 samples, 0.01%) + + + +[[kernel.kallsyms]] (9,141,854 samples, 0.03%) + + + +[[kernel.kallsyms]] (5,028,875 samples, 0.02%) + + + +__pthread_clockjoin_ex (11,424,709 samples, 0.04%) + + + +std::_Vector_base<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul>, std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> > >::_M_allocate (4,987,319 samples, 0.02%) + + + +[[kernel.kallsyms]] (4,472,549 samples, 0.01%) + + + +_mm512_stream_load_si512 (3,787,754,318 samples, 12.57%) +_mm512_stream_load.. + + +std::thread::thread<void (22,944,895 samples, 0.08%) + + + +[[kernel.kallsyms]] (62,792,726 samples, 0.21%) + + + +alloc_new_heap (4,987,319 samples, 0.02%) + + + +__GI_mprotect (69,459,330 samples, 0.23%) + + + +std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (52,594,123 samples, 0.17%) + + + +[[kernel.kallsyms]] (4,401,293 samples, 0.01%) + + + +[[kernel.kallsyms]] (102,946,189 samples, 0.34%) + + + +[[kernel.kallsyms]] (9,437,055 samples, 0.03%) + + + +[[kernel.kallsyms]] (3,837,971 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,631,112 samples, 0.03%) + + + +std::unordered_map<unsigned char*, dsacache::CacheData, std::hash<unsigned char*>, std::equal_to<unsigned char*>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> > >::find (2,594,606 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,538,962,676 samples, 25.01%) +[[kernel.kallsyms]] + + +[[kernel.kallsyms]] (47,628,726 samples, 0.16%) + + + +[[kernel.kallsyms]] (20,424,864 samples, 0.07%) + + + +[[kernel.kallsyms]] (7,079,005 samples, 0.02%) + + + +__GI___libc_malloc (4,987,319 samples, 0.02%) + + + +alloc_new_heap (3,304,446 samples, 0.01%) + + + +[[kernel.kallsyms]] (9,834,968 samples, 0.03%) + + + +[[kernel.kallsyms]] (47,628,726 samples, 0.16%) + + + +[[kernel.kallsyms]] (4,472,549 samples, 0.01%) + + + +[[kernel.kallsyms]] (3,384,849 samples, 0.01%) + + + +[[kernel.kallsyms]] (65,452,356 samples, 0.22%) + + + +[[kernel.kallsyms]] (3,699,146 samples, 0.01%) + + + +[[kernel.kallsyms]] (9,894,226 samples, 0.03%) + + + +std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (72,172,962 samples, 0.24%) + + + +[[kernel.kallsyms]] (4,546,338 samples, 0.02%) + + + +[[kernel.kallsyms]] (5,755,668 samples, 0.02%) + + + +mbind (26,286,797 samples, 0.09%) + + + +arena_get2 (3,304,446 samples, 0.01%) + + + +[[kernel.kallsyms]] (20,389,388 samples, 0.07%) + + + +numa_node_size64 (34,113,977 samples, 0.11%) + + + +[[kernel.kallsyms]] (6,683,955 samples, 0.02%) + + + +[[kernel.kallsyms]] (3,052,559 samples, 0.01%) + + + +dsacache::Cache::GetCacheNode (14,523,628 samples, 0.05%) + + + +dsacache::Cache::Access (344,524,710 samples, 1.14%) + + + +syscall (37,463,632 samples, 0.12%) + + + +[[kernel.kallsyms]] (13,797,838 samples, 0.05%) + + + +[[kernel.kallsyms]] (3,851,807 samples, 0.01%) + + + +dsacache::Cache::GetFromCache (5,544,863 samples, 0.02%) + + + +[[kernel.kallsyms]] (3,699,146 samples, 0.01%) + + + +std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::_M_deallocate_nodes (45,636,194 samples, 0.15%) + + + +dsacache::Cache::SubmitTask (320,402,788 samples, 1.06%) + + + +[[kernel.kallsyms]] (3,714,704 samples, 0.01%) + + + +[[kernel.kallsyms]] (11,628,987 samples, 0.04%) + + + +[[kernel.kallsyms]] (43,912,672 samples, 0.15%) + + + +dml::core::dispatcher::hw_dispatcher::get_instance (36,899,252 samples, 0.12%) + + + +scan_a (5,953,797,536 samples, 19.75%) +scan_a + + +[[kernel.kallsyms]] (6,847,267 samples, 0.02%) + + + +[[kernel.kallsyms]] (4,895,709 samples, 0.02%) + + + +[[kernel.kallsyms]] (9,141,854 samples, 0.03%) + + + +std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (72,172,962 samples, 0.24%) + + + +[[kernel.kallsyms]] (12,818,755 samples, 0.04%) + + + +void std::__atomic_wait_address<std::__barrier_phase_t, std::__tree_barrier<NopStruct>::wait (4,854,984 samples, 0.02%) + + + +[[kernel.kallsyms]] (7,710,845 samples, 0.03%) + + + +[[kernel.kallsyms]] (1,977,950,190 samples, 6.56%) +[[kernel.. + + +void fill_mt<unsigned long> (17,803,043,695 samples, 59.07%) +void fill_mt<unsigned long> + + +[[kernel.kallsyms]] (11,918,533 samples, 0.04%) + + + +[[kernel.kallsyms]] (8,358,946 samples, 0.03%) + + + +_mm512_cmplt_epi64_mask (25,725,457 samples, 0.09%) + + + +[[kernel.kallsyms]] (68,850,041 samples, 0.23%) + + + +[[kernel.kallsyms]] (15,207,374 samples, 0.05%) + + + +[[kernel.kallsyms]] (13,082,986 samples, 0.04%) + + + +_mid_memalign (72,172,962 samples, 0.24%) + + + +[[kernel.kallsyms]] (4,006,642 samples, 0.01%) + + + +[[kernel.kallsyms]] (1,988,864,167 samples, 6.60%) +[[kernel.. + + +[[kernel.kallsyms]] (4,208,454 samples, 0.01%) + + + +[[kernel.kallsyms]] (3,319,544 samples, 0.01%) + + + +std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> >::allocate (4,987,319 samples, 0.02%) + + + +[[kernel.kallsyms]] (20,060,419 samples, 0.07%) + + + +[[kernel.kallsyms]] (2,918,739 samples, 0.01%) + + + +dml::detail::ml::task<std::allocator<unsigned char> >::task (108,946,296 samples, 0.36%) + + + +[[kernel.kallsyms]] (6,590,776 samples, 0.02%) + + + +[[kernel.kallsyms]] (24,127,088 samples, 0.08%) + + + +[[kernel.kallsyms]] (4,726,175 samples, 0.02%) + + + +[[kernel.kallsyms]] (56,689,460 samples, 0.19%) + + + +[[kernel.kallsyms]] (4,592,888 samples, 0.02%) + + + +[[kernel.kallsyms]] (2,600,889 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,538,487 samples, 0.03%) + + + +[[kernel.kallsyms]] (48,281,164 samples, 0.16%) + + + +wqs_init (24,925,539 samples, 0.08%) + + + +[[kernel.kallsyms]] (6,359,112 samples, 0.02%) + + + +[[kernel.kallsyms]] (10,627,702 samples, 0.04%) + + + +__GI___nptl_deallocate_stack (10,627,702 samples, 0.04%) + + + +[[kernel.kallsyms]] (2,933,926 samples, 0.01%) + + + +accfg_wq_get_state (4,992,336 samples, 0.02%) + + + +dsacache::Cache::ExecuteCopy (147,238,622 samples, 0.49%) + + + +[[kernel.kallsyms]] (8,234,982 samples, 0.03%) + + + +std::unordered_map<unsigned char*, dsacache::CacheData, std::hash<unsigned char*>, std::equal_to<unsigned char*>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> > >::clear (45,636,194 samples, 0.15%) + + + +[[kernel.kallsyms]] (4,592,888 samples, 0.02%) + + + +std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (108,946,296 samples, 0.36%) + + + +[[kernel.kallsyms]] (9,437,055 samples, 0.03%) + + + +[[kernel.kallsyms]] (3,319,544 samples, 0.01%) + + + +__libc_open64 (4,472,549 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,549,011,630 samples, 25.05%) +[[kernel.kallsyms]] + + +numa_alloc_onnode (86,456,066 samples, 0.29%) + + + +[[kernel.kallsyms]] (4,546,338 samples, 0.02%) + + + +dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (52,594,123 samples, 0.17%) + + + +[[kernel.kallsyms]] (4,802,028 samples, 0.02%) + + + +__libc_start_call_main (19,310,456,147 samples, 64.07%) +__libc_start_call_main + + +[[kernel.kallsyms]] (43,912,672 samples, 0.15%) + + + +__GI__Fork (3,899,192 samples, 0.01%) + + + +syscall (26,286,797 samples, 0.09%) + + + +[[kernel.kallsyms]] (6,848,234 samples, 0.02%) + + + +[[kernel.kallsyms]] (2,963,362 samples, 0.01%) + + + +[[kernel.kallsyms]] (2,810,815 samples, 0.01%) + + + +__pthread_create_2_1 (22,944,895 samples, 0.08%) + + + +[[kernel.kallsyms]] (4,330,637 samples, 0.01%) + + + +[[kernel.kallsyms]] (10,627,702 samples, 0.04%) + + + +[[kernel.kallsyms]] (56,689,460 samples, 0.19%) + + + +[[kernel.kallsyms]] (9,834,968 samples, 0.03%) + + + +numa_alloc_onnode (75,010,861 samples, 0.25%) + + + +[[kernel.kallsyms]] (89,429,025 samples, 0.30%) + + + +_mid_memalign (76,042,767 samples, 0.25%) + + + +std::thread::join (11,424,709 samples, 0.04%) + + + +[[kernel.kallsyms]] (8,278,132 samples, 0.03%) + + + +[[kernel.kallsyms]] (3,319,544 samples, 0.01%) + + + +[[kernel.kallsyms]] (1,988,023,274 samples, 6.60%) +[[kernel.. + + +[[kernel.kallsyms]] (14,680,016 samples, 0.05%) + + + +[[kernel.kallsyms]] (8,234,982 samples, 0.03%) + + + +std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (76,042,767 samples, 0.25%) + + + +[[kernel.kallsyms]] (25,500,960 samples, 0.08%) + + + +[[kernel.kallsyms]] (4,546,338 samples, 0.02%) + + + +[[kernel.kallsyms]] (3,327,938 samples, 0.01%) + + + +[[kernel.kallsyms]] (102,946,189 samples, 0.34%) + + + +[[kernel.kallsyms]] (9,141,854 samples, 0.03%) + + + +std::allocator_traits<std::allocator<std::array<std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >, 3ul> > >::allocate (3,304,446 samples, 0.01%) + + + +[[kernel.kallsyms]] (5,376,046 samples, 0.02%) + + + +accfg_wq_get_first (24,925,539 samples, 0.08%) + + + +Filter<unsigned long, LT, (5,946,231,867 samples, 19.73%) +Filter<unsigned long, LT, + + +numactl (2,657,805 samples, 0.01%) + + + +[[kernel.kallsyms]] (1,138,323,806 samples, 3.78%) +[[ke.. + + +[[kernel.kallsyms]] (7,538,487 samples, 0.03%) + + + +[[kernel.kallsyms]] (45,889,320 samples, 0.15%) + + + +arena_get2 (3,304,446 samples, 0.01%) + + + +[[kernel.kallsyms]] (67,580,074 samples, 0.22%) + -devices_init (6,615,510 samples, 0.01%) - +Aggregation<unsigned long, Sum, (1,512,909,229 samples, 5.02%) +Aggreg.. -[[kernel.kallsyms]] (6,052,541 samples, 0.01%) - +[[kernel.kallsyms]] (37,531,108 samples, 0.12%) + -[[kernel.kallsyms]] (6,835,870 samples, 0.01%) - +[[kernel.kallsyms]] (8,326,133 samples, 0.03%) + -[[kernel.kallsyms]] (44,111,847 samples, 0.08%) - +[[kernel.kallsyms]] (5,560,385 samples, 0.02%) + -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - +[[kernel.kallsyms]] (5,171,798 samples, 0.02%) + -[[kernel.kallsyms]] (31,692,824 samples, 0.06%) - +[[kernel.kallsyms]] (6,340,786 samples, 0.02%) + -[[kernel.kallsyms]] (11,169,708 samples, 0.02%) - +dsacache::CacheData::WaitOnCompletion (91,425,402 samples, 0.30%) + -std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (35,828,488 samples, 0.06%) - +[[kernel.kallsyms]] (67,580,074 samples, 0.22%) + -[[kernel.kallsyms]] (6,066,302 samples, 0.01%) - +std::thread::_M_start_thread (22,944,895 samples, 0.08%) + -[[kernel.kallsyms]] (5,721,684 samples, 0.01%) - +_int_new_arena (3,304,446 samples, 0.01%) + -[[kernel.kallsyms]] (7,311,966,115 samples, 13.01%) -[[kernel.kallsyms]] +[[kernel.kallsyms]] (10,199,949 samples, 0.03%) + -[[kernel.kallsyms]] (92,507,760 samples, 0.16%) - +[[kernel.kallsyms]] (2,600,889 samples, 0.01%) + -[[kernel.kallsyms]] (36,196,373 samples, 0.06%) - +[[kernel.kallsyms]] (24,142,755 samples, 0.08%) + -[[kernel.kallsyms]] (15,896,933 samples, 0.03%) - +[[kernel.kallsyms]] (85,062,631 samples, 0.28%) + -[[kernel.kallsyms]] (56,894,061 samples, 0.10%) - +_IO_new_file_underflow (10,331,560 samples, 0.03%) + -[[kernel.kallsyms]] (259,363,157 samples, 0.46%) - +[[kernel.kallsyms]] (5,107,024 samples, 0.02%) + -[[kernel.kallsyms]] (19,185,252 samples, 0.03%) - +__GI__IO_file_doallocate (5,482,730 samples, 0.02%) + -[[kernel.kallsyms]] (9,972,283 samples, 0.02%) - +__libc_start_call_main (4,123,565 samples, 0.01%) + -dsacache::Cache::SubmitTask (167,119,904 samples, 0.30%) - +dsacache::Cache::SubmitTask (251,550,889 samples, 0.83%) + -dsacache::Cache::SubmitTask (291,889,749 samples, 0.52%) - +[[kernel.kallsyms]] (8,278,132 samples, 0.03%) + -[anon] (52,831,798 samples, 0.09%) - +dsacache::Cache::AllocOnNode (120,570,043 samples, 0.40%) + -[[kernel.kallsyms]] (6,774,676 samples, 0.01%) - +[[kernel.kallsyms]] (3,750,747 samples, 0.01%) + -[[kernel.kallsyms]] (26,878,702 samples, 0.05%) - +[[kernel.kallsyms]] (38,745,731 samples, 0.13%) + -__GI___mmap64 (32,804,268 samples, 0.06%) - +[[kernel.kallsyms]] (48,607,741 samples, 0.16%) + -[[kernel.kallsyms]] (8,424,524 samples, 0.01%) - +[[kernel.kallsyms]] (7,684,763 samples, 0.03%) + -std::thread& std::vector<std::thread, std::allocator<std::thread> >::emplace_back<void (13,581,055 samples, 0.02%) - +[[kernel.kallsyms]] (56,689,460 samples, 0.19%) + -[[kernel.kallsyms]] (7,324,068,091 samples, 13.03%) -[[kernel.kallsyms]] +[[kernel.kallsyms]] (37,374,812 samples, 0.12%) + -[[kernel.kallsyms]] (13,838,204 samples, 0.02%) - +std::barrier<NopStruct>::wait (4,934,427 samples, 0.02%) + -[[kernel.kallsyms]] (5,188,239 samples, 0.01%) - +[[kernel.kallsyms]] (4,208,454 samples, 0.01%) + -[[kernel.kallsyms]] (13,025,872 samples, 0.02%) - +[[kernel.kallsyms]] (5,794,169 samples, 0.02%) + -[[kernel.kallsyms]] (22,884,622 samples, 0.04%) - +[[kernel.kallsyms]] (7,693,404 samples, 0.03%) +