diff --git a/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-strongwait-perf.svg b/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-strongwait-perf.svg new file mode 100644 index 0000000..2e1f318 --- /dev/null +++ b/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-strongwait-perf.svg @@ -0,0 +1,901 @@ + + + + + + + + + + + + + + +Flame Graph + +Reset Zoom +Search +ic + + + +[[kernel.kallsyms]] (164,773,295 samples, 0.02%) + + + +[[kernel.kallsyms]] (6,264,017,816 samples, 0.83%) + + + +std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::_M_gen_rand (1,700,492,276 samples, 0.22%) + + + +[[kernel.kallsyms]] (182,533,535 samples, 0.02%) + + + +__GI___mmap64 (116,704,759 samples, 0.02%) + + + +[[kernel.kallsyms]] (98,455,521 samples, 0.01%) + + + +[[kernel.kallsyms]] (114,352,088 samples, 0.02%) + + + +dml_wait_busy_poll (552,915,159,655 samples, 72.96%) +dml_wait_busy_poll + + +unsigned long std::uniform_int_distribution<unsigned long>::operator (8,803,762,515 samples, 1.16%) + + + +[[kernel.kallsyms]] (114,699,631 samples, 0.02%) + + + +syscall (100,069,976 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,633,978,792 samples, 1.01%) + + + +[[kernel.kallsyms]] (182,533,535 samples, 0.02%) + + + +[[kernel.kallsyms]] (182,533,535 samples, 0.02%) + + + +[[kernel.kallsyms]] (98,455,521 samples, 0.01%) + + + +scan_a (3,561,308,091 samples, 0.47%) + + + +[[kernel.kallsyms]] (91,913,626 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,640,030,208 samples, 1.01%) + + + +[[kernel.kallsyms]] (346,481,323 samples, 0.05%) + + + +__libc_start_call_main (17,898,607,643 samples, 2.36%) +_.. + + +[[kernel.kallsyms]] (69,787,492 samples, 0.01%) + + + +[[kernel.kallsyms]] (86,066,528 samples, 0.01%) + + + +[[kernel.kallsyms]] (166,112,140 samples, 0.02%) + + + +[[kernel.kallsyms]] (77,375,623 samples, 0.01%) + + + +[[kernel.kallsyms]] (343,998,424 samples, 0.05%) + + + +[libstdc++.so.6.0.32] (557,603,884,217 samples, 73.58%) +[libstdc++.so.6.0.32] + + +Sum<unsigned long>::simd_agg (362,439,246 samples, 0.05%) + + + +dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::get (552,916,888,398 samples, 72.96%) +dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::get + + +_mm512_stream_load_si512 (615,603,507 samples, 0.08%) + + + +[[kernel.kallsyms]] (99,241,564 samples, 0.01%) + + + +[[kernel.kallsyms]] (116,704,759 samples, 0.02%) + + + +[[kernel.kallsyms]] (191,939,483 samples, 0.03%) + + + +[[kernel.kallsyms]] (110,991,220 samples, 0.01%) + + + +[[kernel.kallsyms]] (129,110,077 samples, 0.02%) + + + +start_thread (557,605,749,020 samples, 73.58%) +start_thread + + +__GI_mprotect (115,105,071 samples, 0.02%) + + + +aggr_j (554,037,578,817 samples, 73.11%) +aggr_j + + +[[kernel.kallsyms]] (347,561,693 samples, 0.05%) + + + +[[kernel.kallsyms]] (176,008,215 samples, 0.02%) + + + +dsacache::CacheData::WaitOnCompletion (552,916,888,398 samples, 72.96%) +dsacache::CacheData::WaitOnCompletion + + +[[kernel.kallsyms]] (114,699,631 samples, 0.02%) + + + +[[kernel.kallsyms]] (7,543,227,017 samples, 1.00%) + + + +_mm512_mask_add_epi64 (362,439,246 samples, 0.05%) + + + +QDPBench (757,788,785,986 samples, 100.00%) +QDPBench + + +unsigned long std::uniform_int_distribution<unsigned long>::operator (7,725,758,978 samples, 1.02%) + + + +unsigned long std::uniform_int_distribution<unsigned long>::operator (1,175,539,845 samples, 0.16%) + + + +unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (5,226,322,414 samples, 0.69%) + + + +[[kernel.kallsyms]] (115,105,071 samples, 0.02%) + + + +[[kernel.kallsyms]] (145,397,537 samples, 0.02%) + + + +[[kernel.kallsyms]] (253,156,329 samples, 0.03%) + + + +[[kernel.kallsyms]] (181,506,971 samples, 0.02%) + + + +dsacache::CacheData::WaitOnCompletion (552,919,479,804 samples, 72.96%) +dsacache::CacheData::WaitOnCompletion + + +[[kernel.kallsyms]] (348,415,844 samples, 0.05%) + + + +[[kernel.kallsyms]] (344,498,559 samples, 0.05%) + + + +[[kernel.kallsyms]] (188,486,660 samples, 0.02%) + + + +[[kernel.kallsyms]] (170,827,651 samples, 0.02%) + + + +unsigned long std::uniform_int_distribution<unsigned long>::operator (1,249,330,771 samples, 0.16%) + + + +[[kernel.kallsyms]] (343,489,960 samples, 0.05%) + + + +main (17,898,607,643 samples, 2.36%) +m.. + + +[[kernel.kallsyms]] (7,632,251,240 samples, 1.01%) + + + +[[kernel.kallsyms]] (180,634,037 samples, 0.02%) + + + +[[kernel.kallsyms]] (71,765,486 samples, 0.01%) + + + +[[kernel.kallsyms]] (230,646,684 samples, 0.03%) + + + +[[kernel.kallsyms]] (97,231,694 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,642,626,030 samples, 1.01%) + + + +all (757,816,916,904 samples, 100%) + + + +[[kernel.kallsyms]] (115,836,601 samples, 0.02%) + + + +[[kernel.kallsyms]] (114,158,567 samples, 0.02%) + + + +Filter<unsigned long, LT, (3,559,948,229 samples, 0.47%) + + + +[[stack]] (1,180,363,823 samples, 0.16%) + + + +[[kernel.kallsyms]] (185,073,337 samples, 0.02%) + + + +[[kernel.kallsyms]] (5,223,135,053 samples, 0.69%) + + + +[[kernel.kallsyms]] (346,513,089 samples, 0.05%) + + + +[[kernel.kallsyms]] (163,513,075 samples, 0.02%) + + + +[[kernel.kallsyms]] (114,537,443 samples, 0.02%) + + + +Vector_Loader<unsigned long, (3,012,574,731 samples, 0.40%) + + + +[[kernel.kallsyms]] (104,729,407 samples, 0.01%) + + + +[[kernel.kallsyms]] (98,831,700 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,258,830,703 samples, 0.83%) + + + +[[kernel.kallsyms]] (114,699,631 samples, 0.02%) + + + +[[kernel.kallsyms]] (124,808,299 samples, 0.02%) + + + +[[kernel.kallsyms]] (108,026,851 samples, 0.01%) + + + +unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (949,996,590 samples, 0.13%) + + + +main (1,309,211,173 samples, 0.17%) + + + +void fill_mt<unsigned long> (17,898,607,643 samples, 2.36%) +v.. + + +[[kernel.kallsyms]] (179,769,803 samples, 0.02%) + + + +[[kernel.kallsyms]] (111,528,792 samples, 0.01%) + + + +[[kernel.kallsyms]] (100,647,350 samples, 0.01%) + + + +[[kernel.kallsyms]] (250,139,930 samples, 0.03%) + + + +[[kernel.kallsyms]] (7,638,302,454 samples, 1.01%) + + + +[[kernel.kallsyms]] (97,990,880 samples, 0.01%) + + + +Aggregation<unsigned long, Sum, (1,111,031,257 samples, 0.15%) + + + +sum_check (1,295,796,749 samples, 0.17%) + + + +[[kernel.kallsyms]] (7,640,895,829 samples, 1.01%) + + + +__GI___mmap64 (116,704,759 samples, 0.02%) + + + +[[kernel.kallsyms]] (339,109,648 samples, 0.04%) + + + +[[kernel.kallsyms]] (70,085,427 samples, 0.01%) + + + +[[kernel.kallsyms]] (192,731,474 samples, 0.03%) + + + +[[kernel.kallsyms]] (88,390,565 samples, 0.01%) + + + +__GI_munmap (182,533,535 samples, 0.02%) + + + +[[kernel.kallsyms]] (176,361,920 samples, 0.02%) + + + +[[kernel.kallsyms]] (182,533,535 samples, 0.02%) + + + +Vector_Loader<unsigned long, (615,603,507 samples, 0.08%) + + + +std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (4,108,353,659 samples, 0.54%) + + + +[[kernel.kallsyms]] (97,991,797 samples, 0.01%) + + + +[[kernel.kallsyms]] (71,178,655 samples, 0.01%) + + + +_mm512_stream_load_si512 (3,012,574,731 samples, 0.40%) + + + +[[kernel.kallsyms]] (181,571,536 samples, 0.02%) + + + +[[kernel.kallsyms]] (105,272,333 samples, 0.01%) + + + +[unknown] (1,358,752,193 samples, 0.18%) + + + +[[kernel.kallsyms]] (115,401,998 samples, 0.02%) + + + +[[kernel.kallsyms]] (110,122,437 samples, 0.01%) + + + +[[kernel.kallsyms]] (182,533,535 samples, 0.02%) + + + +[[kernel.kallsyms]] (182,533,535 samples, 0.02%) + + + +[[kernel.kallsyms]] (105,922,725 samples, 0.01%) + + + +[[kernel.kallsyms]] (92,658,484 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,642,626,030 samples, 1.01%) + + + + diff --git a/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-weakwait-perf.svg b/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-weakwait-perf.svg new file mode 100644 index 0000000..1d0dc86 --- /dev/null +++ b/qdp_project/evaluation-results/qdp-xeonmax-simple-prefetch-weakwait-perf.svg @@ -0,0 +1,2541 @@ + + + + + + + + + + + + + + +Flame Graph + +Reset Zoom +Search +ic + + + +[[kernel.kallsyms]] (13,838,204 samples, 0.02%) + + + +[[kernel.kallsyms]] (52,582,122 samples, 0.09%) + + + +[[kernel.kallsyms]] (33,581,259 samples, 0.06%) + + + +[[kernel.kallsyms]] (8,650,216 samples, 0.02%) + + + +[[kernel.kallsyms]] (15,074,171 samples, 0.03%) + + + +[[kernel.kallsyms]] (8,457,723 samples, 0.02%) + + + +__GI_mprotect (9,972,283 samples, 0.02%) + + + +[[kernel.kallsyms]] (44,111,847 samples, 0.08%) + + + +[[kernel.kallsyms]] (5,997,398 samples, 0.01%) + + + +[[kernel.kallsyms]] (8,647,063 samples, 0.02%) + + + +syscall (19,162,146 samples, 0.03%) + + + +[[kernel.kallsyms]] (64,992,482 samples, 0.12%) + + + +__libc_start_main_impl (47,034,233,602 samples, 83.69%) +__libc_start_main_impl + + +[[kernel.kallsyms]] (6,540,584 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,324,068,091 samples, 13.03%) +[[kernel.kallsyms]] + + +[[kernel.kallsyms]] (14,696,092 samples, 0.03%) + + + +dsacache::CacheData::~CacheData (27,978,244,770 samples, 49.78%) +dsacache::CacheData::~CacheData + + +unsigned long std::uniform_int_distribution<unsigned long>::operator (1,224,461,441 samples, 2.18%) +u.. + + +[[kernel.kallsyms]] (19,939,586 samples, 0.04%) + + + +[[kernel.kallsyms]] (78,676,124 samples, 0.14%) + + + +auto dml::detail::ml::make_mem_move_task<std::allocator<unsigned char> > (52,769,643 samples, 0.09%) + + + +[[kernel.kallsyms]] (58,263,661 samples, 0.10%) + + + +[[kernel.kallsyms]] (9,614,253 samples, 0.02%) + + + +dml_wait_busy_poll (27,883,144,753 samples, 49.61%) +dml_wait_busy_poll + + +[[kernel.kallsyms]] (56,894,061 samples, 0.10%) + + + +std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (10,813,731 samples, 0.02%) + + + +[[kernel.kallsyms]] (5,469,680 samples, 0.01%) + + + +[[kernel.kallsyms]] (11,778,485 samples, 0.02%) + + + +[[kernel.kallsyms]] (92,507,760 samples, 0.16%) + + + +[[kernel.kallsyms]] (8,647,063 samples, 0.02%) + + + +unsigned long std::uniform_int_distribution<unsigned long>::operator (8,778,281,021 samples, 15.62%) +unsigned long std::unifo.. + + +device_parse (16,212,211 samples, 0.03%) + + + +__GI_mprotect (50,154,117 samples, 0.09%) + + + +[[kernel.kallsyms]] (6,005,283,229 samples, 10.69%) +[[kernel.kallsy.. + + +[[kernel.kallsyms]] (6,835,870 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,143,140 samples, 0.01%) + + + +[[kernel.kallsyms]] (11,778,485 samples, 0.02%) + + + +[[kernel.kallsyms]] (25,484,485 samples, 0.05%) + + + +[[kernel.kallsyms]] (26,297,644 samples, 0.05%) + + + +[[kernel.kallsyms]] (259,363,157 samples, 0.46%) + + + +_int_malloc (10,813,731 samples, 0.02%) + + + +[[kernel.kallsyms]] (9,113,218 samples, 0.02%) + + + +[[kernel.kallsyms]] (6,052,541 samples, 0.01%) + + + +[[kernel.kallsyms]] (80,579,994 samples, 0.14%) + + + +[[kernel.kallsyms]] (7,344,388 samples, 0.01%) + + + +[[kernel.kallsyms]] (8,004,974 samples, 0.01%) + + + +[[kernel.kallsyms]] (20,280,664 samples, 0.04%) + + + +dml::core::dispatcher::hw_dispatcher::initialize_hw (23,965,836 samples, 0.04%) + + + +[[kernel.kallsyms]] (25,484,485 samples, 0.05%) + + + +std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (50,149,163 samples, 0.09%) + + + +[[kernel.kallsyms]] (6,052,541 samples, 0.01%) + + + +[[kernel.kallsyms]] (256,851,116 samples, 0.46%) + + + +[[kernel.kallsyms]] (7,604,051 samples, 0.01%) + + + +[[kernel.kallsyms]] (34,047,863 samples, 0.06%) + + + +__GI__IO_file_open (17,566,875 samples, 0.03%) + + + +[[kernel.kallsyms]] (93,967,068 samples, 0.17%) + + + +[[kernel.kallsyms]] (14,391,559 samples, 0.03%) + + + +[[kernel.kallsyms]] (56,894,061 samples, 0.10%) + + + +dml::core::dispatcher::hw_queue::initialize_new_queue (4,914,889 samples, 0.01%) + + + +grow_heap (9,972,283 samples, 0.02%) + + + +[[kernel.kallsyms]] (26,783,041 samples, 0.05%) + + + +[[kernel.kallsyms]] (7,785,866 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,344,388 samples, 0.01%) + + + +[[kernel.kallsyms]] (8,650,216 samples, 0.02%) + + + +[[kernel.kallsyms]] (18,034,607 samples, 0.03%) + + + +dsacache::CacheData::WaitOnCompletion (27,884,008,166 samples, 49.61%) +dsacache::CacheData::WaitOnCompletion + + +[[kernel.kallsyms]] (16,912,912 samples, 0.03%) + + + +[[kernel.kallsyms]] (5,740,414 samples, 0.01%) + + + +_int_memalign (52,769,643 samples, 0.09%) + + + +[[kernel.kallsyms]] (56,894,061 samples, 0.10%) + + + +[[kernel.kallsyms]] (50,154,117 samples, 0.09%) + + + +[[kernel.kallsyms]] (7,316,287,736 samples, 13.02%) +[[kernel.kallsyms]] + + +scan_b (201,442,301 samples, 0.36%) + + + +[[kernel.kallsyms]] (6,052,541 samples, 0.01%) + + + +std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (46,717,102 samples, 0.08%) + + + +dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (52,769,643 samples, 0.09%) + + + +[[kernel.kallsyms]] (24,636,857 samples, 0.04%) + + + +[[kernel.kallsyms]] (5,197,845 samples, 0.01%) + + + +_int_malloc (35,828,488 samples, 0.06%) + + + +[[kernel.kallsyms]] (7,785,866 samples, 0.01%) + + + +sysmalloc (52,769,643 samples, 0.09%) + + + +[[kernel.kallsyms]] (25,484,485 samples, 0.05%) + + + +[[kernel.kallsyms]] (19,162,146 samples, 0.03%) + + + +[[kernel.kallsyms]] (34,047,863 samples, 0.06%) + + + +[[kernel.kallsyms]] (25,918,503 samples, 0.05%) + + + +main (47,031,727,484 samples, 83.69%) +main + + +Filter<unsigned long, LT, (3,458,098,462 samples, 6.15%) +Filter<u.. + + +[[kernel.kallsyms]] (6,835,870 samples, 0.01%) + + + +[[kernel.kallsyms]] (14,838,521 samples, 0.03%) + + + +[[kernel.kallsyms]] (8,356,058 samples, 0.01%) + + + +accfg_wq_get_first (16,212,211 samples, 0.03%) + + + +std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (46,717,102 samples, 0.08%) + + + +[[kernel.kallsyms]] (90,779,509 samples, 0.16%) + + + +[[kernel.kallsyms]] (9,513,431 samples, 0.02%) + + + +[[kernel.kallsyms]] (15,074,171 samples, 0.03%) + + + +__GI___nptl_deallocate_stack (6,540,584 samples, 0.01%) + + + +[[kernel.kallsyms]] (15,623,013 samples, 0.03%) + + + +[[kernel.kallsyms]] (8,817,500 samples, 0.02%) + + + +dsacache::Cache::ExecuteCopy (78,330,121 samples, 0.14%) + + + +sysmalloc (35,828,488 samples, 0.06%) + + + +[[kernel.kallsyms]] (7,229,598 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,370,691 samples, 0.01%) + + + +[[kernel.kallsyms]] (179,023,692 samples, 0.32%) + + + +dsacache::Cache::Clear (27,978,244,770 samples, 49.78%) +dsacache::Cache::Clear + + +[[kernel.kallsyms]] (11,870,294 samples, 0.02%) + + + +[[kernel.kallsyms]] (22,333,273 samples, 0.04%) + + + +[[kernel.kallsyms]] (16,822,016 samples, 0.03%) + + + +[[kernel.kallsyms]] (10,378,720 samples, 0.02%) + + + +allocate_stack (12,724,581 samples, 0.02%) + + + +[[kernel.kallsyms]] (12,973,510 samples, 0.02%) + + + +[[kernel.kallsyms]] (11,870,294 samples, 0.02%) + + + +[[kernel.kallsyms]] (32,804,268 samples, 0.06%) + + + +[[kernel.kallsyms]] (12,330,911 samples, 0.02%) + + + +aggr_j (2,905,466,537 samples, 5.17%) +aggr_j + + +[[kernel.kallsyms]] (25,315,610 samples, 0.05%) + + + +[[kernel.kallsyms]] (32,804,268 samples, 0.06%) + + + +[[kernel.kallsyms]] (7,785,866 samples, 0.01%) + + + +numa_alloc_onnode (146,910,655 samples, 0.26%) + + + +[[kernel.kallsyms]] (21,152,354 samples, 0.04%) + + + +dml::detail::ml::task<std::allocator<unsigned char> >::task (10,813,731 samples, 0.02%) + + + +[[kernel.kallsyms]] (56,894,061 samples, 0.10%) + + + +[[kernel.kallsyms]] (45,659,261 samples, 0.08%) + + + +[[kernel.kallsyms]] (6,066,302 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,104,752 samples, 0.01%) + + + +_int_memalign (35,828,488 samples, 0.06%) + + + +[[kernel.kallsyms]] (7,169,816 samples, 0.01%) + + + +[[kernel.kallsyms]] (48,567,551 samples, 0.09%) + + + +[[kernel.kallsyms]] (52,197,849 samples, 0.09%) + + + +auto dml::detail::submit<dml::hardware, dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >, dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (78,330,121 samples, 0.14%) + + + +[[kernel.kallsyms]] (6,052,541 samples, 0.01%) + + + +[[kernel.kallsyms]] (60,893,681 samples, 0.11%) + + + +[[kernel.kallsyms]] (80,579,994 samples, 0.14%) + + + +__nptl_free_stacks (6,540,584 samples, 0.01%) + + + +[[kernel.kallsyms]] (259,363,157 samples, 0.46%) + + + +std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::_M_deallocate_nodes (27,978,244,770 samples, 49.78%) +std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pa.. + + +[[kernel.kallsyms]] (50,154,117 samples, 0.09%) + + + +__GI___libc_read (6,155,254 samples, 0.01%) + + + +__GI___mmap64 (93,967,068 samples, 0.17%) + + + +mbind (26,878,702 samples, 0.05%) + + + +[[kernel.kallsyms]] (12,902,294 samples, 0.02%) + + + +void std::allocator_traits<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::destroy<std::pair<unsigned char* const, dsacache::CacheData> > (27,978,244,770 samples, 49.78%) +void std::allocator_traits<std::allocator<std::__detail::_Hash_node<std::pair<un.. + + +[[kernel.kallsyms]] (56,894,061 samples, 0.10%) + + + +[[kernel.kallsyms]] (13,838,204 samples, 0.02%) + + + +Aggregation<unsigned long, Sum, (2,493,804,416 samples, 4.44%) +Aggre.. + + +[[kernel.kallsyms]] (49,739,189 samples, 0.09%) + + + +[[kernel.kallsyms]] (54,854,853 samples, 0.10%) + + + +[[kernel.kallsyms]] (44,111,847 samples, 0.08%) + + + +[[kernel.kallsyms]] (43,636,433 samples, 0.08%) + + + +numa_node_size64 (59,279,404 samples, 0.11%) + + + +[[kernel.kallsyms]] (76,833,477 samples, 0.14%) + + + +_mm512_stream_load_si512 (1,931,677,068 samples, 3.44%) +_mm.. + + +[[stack]] (1,230,514,799 samples, 2.19%) +[.. + + +[[kernel.kallsyms]] (10,494,303 samples, 0.02%) + + + +[[kernel.kallsyms]] (7,785,866 samples, 0.01%) + + + +[[kernel.kallsyms]] (11,778,485 samples, 0.02%) + + + +[[kernel.kallsyms]] (24,405,965 samples, 0.04%) + + + +[[kernel.kallsyms]] (6,540,584 samples, 0.01%) + + + +[[kernel.kallsyms]] (9,512,265 samples, 0.02%) + + + +dsacache::Cache::Access (319,122,983 samples, 0.57%) + + + +[[kernel.kallsyms]] (6,052,541 samples, 0.01%) + + + +[[kernel.kallsyms]] (5,740,414 samples, 0.01%) + + + +[[kernel.kallsyms]] (78,924,962 samples, 0.14%) + + + +[[kernel.kallsyms]] (93,967,068 samples, 0.17%) + + + +[[kernel.kallsyms]] (22,459,290 samples, 0.04%) + + + +[[kernel.kallsyms]] (32,804,268 samples, 0.06%) + + + +[[kernel.kallsyms]] (56,894,061 samples, 0.10%) + + + +[[kernel.kallsyms]] (12,108,425 samples, 0.02%) + + + +[[kernel.kallsyms]] (256,323,849 samples, 0.46%) + + + +std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (52,769,643 samples, 0.09%) + + + +__GI___libc_read (6,155,254 samples, 0.01%) + + + +operator new (46,717,102 samples, 0.08%) + + + +dml::core::hardware_device::submit (25,560,478 samples, 0.05%) + + + +_int_malloc (45,810,524 samples, 0.08%) + + + +[[kernel.kallsyms]] (14,391,559 samples, 0.03%) + + + +std::unordered_map<unsigned char*, dsacache::CacheData, std::hash<unsigned char*>, std::equal_to<unsigned char*>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> > >::clear (27,978,244,770 samples, 49.78%) +std::unordered_map<unsigned char*, dsacache::CacheData, std::hash<unsigned char*.. + + +dml::handler<dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >::allocator_type> dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (49,871,202 samples, 0.09%) + + + +[[kernel.kallsyms]] (5,197,845 samples, 0.01%) + + + +[[kernel.kallsyms]] (15,189,233 samples, 0.03%) + + + +[[kernel.kallsyms]] (6,540,584 samples, 0.01%) + + + +__GI_munmap (56,894,061 samples, 0.10%) + + + +[[kernel.kallsyms]] (13,011,844 samples, 0.02%) + + + +[[kernel.kallsyms]] (9,512,265 samples, 0.02%) + + + +operator new (35,828,488 samples, 0.06%) + + + +[[kernel.kallsyms]] (91,643,182 samples, 0.16%) + + + +std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (35,828,488 samples, 0.06%) + + + +[[kernel.kallsyms]] (67,470,988 samples, 0.12%) + + + +[[kernel.kallsyms]] (7,929,584 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,835,870 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,344,388 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,785,217 samples, 0.01%) + + + +[[kernel.kallsyms]] (8,650,216 samples, 0.02%) + + + +[[kernel.kallsyms]] (7,344,388 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,030,285 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,450,133 samples, 0.01%) + + + +accfg_get_param_long (9,811,793 samples, 0.02%) + + + +[[kernel.kallsyms]] (29,377,009 samples, 0.05%) + + + +[[kernel.kallsyms]] (19,162,146 samples, 0.03%) + + + +[[kernel.kallsyms]] (6,066,302 samples, 0.01%) + + + +[[kernel.kallsyms]] (11,362,923 samples, 0.02%) + + + +_IO_new_file_underflow (6,564,990 samples, 0.01%) + + + +[[kernel.kallsyms]] (50,154,117 samples, 0.09%) + + + +[[kernel.kallsyms]] (6,774,676 samples, 0.01%) + + + +[[kernel.kallsyms]] (11,778,485 samples, 0.02%) + + + +dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (52,769,643 samples, 0.09%) + + + +[[kernel.kallsyms]] (25,315,610 samples, 0.05%) + + + +[[kernel.kallsyms]] (6,357,006 samples, 0.01%) + + + +[[kernel.kallsyms]] (16,010,002 samples, 0.03%) + + + +__libc_start_call_main (47,034,233,602 samples, 83.69%) +__libc_start_call_main + + +dsacache::Cache::GetCacheNode (28,516,368 samples, 0.05%) + + + +void fill_mt<unsigned long> (17,603,955,046 samples, 31.32%) +void fill_mt<unsigned long> + + +auto dml::detail::submit<dml::hardware, dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >, dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (49,871,202 samples, 0.09%) + + + +[[kernel.kallsyms]] (13,838,204 samples, 0.02%) + + + +__GI___libc_read (15,623,013 samples, 0.03%) + + + +dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (46,717,102 samples, 0.08%) + + + +[[kernel.kallsyms]] (26,878,702 samples, 0.05%) + + + +[[kernel.kallsyms]] (26,878,702 samples, 0.05%) + + + +[[kernel.kallsyms]] (6,540,584 samples, 0.01%) + + + +_IO_new_fclose (8,909,079 samples, 0.02%) + + + +[[kernel.kallsyms]] (8,647,063 samples, 0.02%) + + + +[[kernel.kallsyms]] (25,484,485 samples, 0.05%) + + + +[[kernel.kallsyms]] (80,579,994 samples, 0.14%) + + + +[[kernel.kallsyms]] (155,318,192 samples, 0.28%) + + + +add_wq (15,236,807 samples, 0.03%) + + + +openat (6,835,870 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,785,866 samples, 0.01%) + + + +std::thread::_M_start_thread (12,724,581 samples, 0.02%) + + + +sysmalloc (45,810,524 samples, 0.08%) + + + +[[kernel.kallsyms]] (42,101,327 samples, 0.07%) + + + +[[kernel.kallsyms]] (5,553,168 samples, 0.01%) + + + +advise_stack_range (11,778,485 samples, 0.02%) + + + +_int_memalign (46,717,102 samples, 0.08%) + + + +[[kernel.kallsyms]] (88,186,671 samples, 0.16%) + + + +dsacache::Cache::AllocOnNode (206,190,059 samples, 0.37%) + + + +syscall (52,197,849 samples, 0.09%) + + + +[[kernel.kallsyms]] (34,047,863 samples, 0.06%) + + + +_mm512_cmplt_epi64_mask (26,906,506 samples, 0.05%) + + + +[[kernel.kallsyms]] (6,066,302 samples, 0.01%) + + + +[[kernel.kallsyms]] (8,647,063 samples, 0.02%) + + + +[[kernel.kallsyms]] (5,869,681 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,323,204,071 samples, 13.03%) +[[kernel.kallsyms]] + + +[[kernel.kallsyms]] (8,650,216 samples, 0.02%) + + + +start_thread (6,580,772,960 samples, 11.71%) +start_thread + + +dml::detail::ml::task<std::allocator<unsigned char> >::task (46,717,102 samples, 0.08%) + + + +std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::_M_gen_rand (1,623,195,042 samples, 2.89%) +st.. + + +std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (52,769,643 samples, 0.09%) + + + +[[kernel.kallsyms]] (50,154,117 samples, 0.09%) + + + +[[kernel.kallsyms]] (16,480,213 samples, 0.03%) + + + +mbind (52,197,849 samples, 0.09%) + + + +[[kernel.kallsyms]] (10,757,457 samples, 0.02%) + + + +[[kernel.kallsyms]] (5,997,398 samples, 0.01%) + + + +[[kernel.kallsyms]] (8,817,500 samples, 0.02%) + + + +[[kernel.kallsyms]] (18,346,934 samples, 0.03%) + + + +[[kernel.kallsyms]] (25,918,503 samples, 0.05%) + + + +Sum<unsigned long>::simd_agg (414,838,602 samples, 0.74%) + + + +[[kernel.kallsyms]] (34,047,863 samples, 0.06%) + + + +__GI_munmap (80,579,994 samples, 0.14%) + + + +[[kernel.kallsyms]] (92,507,760 samples, 0.16%) + + + +[[kernel.kallsyms]] (6,540,584 samples, 0.01%) + + + +[[kernel.kallsyms]] (5,002,702,867 samples, 8.90%) +[[kernel.kal.. + + +[[kernel.kallsyms]] (32,408,072 samples, 0.06%) + + + +[[kernel.kallsyms]] (9,123,981 samples, 0.02%) + + + +[[kernel.kallsyms]] (6,052,541 samples, 0.01%) + + + +__GI_munmap (6,540,584 samples, 0.01%) + + + +_int_malloc (52,769,643 samples, 0.09%) + + + +[[kernel.kallsyms]] (44,111,847 samples, 0.08%) + + + +__GI___mmap64 (32,804,268 samples, 0.06%) + + + +[[kernel.kallsyms]] (85,434,447 samples, 0.15%) + + + +Vector_Loader<unsigned long, (2,993,062,044 samples, 5.33%) +Vector.. + + +[[kernel.kallsyms]] (7,785,866 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,052,541 samples, 0.01%) + + + +[[kernel.kallsyms]] (11,778,485 samples, 0.02%) + + + +grow_heap (44,111,847 samples, 0.08%) + + + +_mid_memalign (35,828,488 samples, 0.06%) + + + +_start (47,036,380,628 samples, 83.69%) +_start + + +[[kernel.kallsyms]] (25,918,503 samples, 0.05%) + + + +dsacache::CacheData::Deallocate (92,507,760 samples, 0.16%) + + + +[[kernel.kallsyms]] (19,162,146 samples, 0.03%) + + + +[[kernel.kallsyms]] (8,551,434 samples, 0.02%) + + + +[[kernel.kallsyms]] (5,543,739 samples, 0.01%) + + + +[[kernel.kallsyms]] (15,896,933 samples, 0.03%) + + + +[[kernel.kallsyms]] (6,052,541 samples, 0.01%) + + + +_IO_new_file_fopen (17,566,875 samples, 0.03%) + + + +__fopen_internal (17,566,875 samples, 0.03%) + + + +unsigned long std::uniform_int_distribution<unsigned long>::operator (1,226,127,583 samples, 2.18%) +u.. + + +grow_heap (34,047,863 samples, 0.06%) + + + +[[kernel.kallsyms]] (10,376,330 samples, 0.02%) + + + +[[kernel.kallsyms]] (35,593,800 samples, 0.06%) + + + +std::thread::thread<void (12,724,581 samples, 0.02%) + + + +[[kernel.kallsyms]] (41,592,934 samples, 0.07%) + + + +dsacache::Cache::Access (199,605,584 samples, 0.36%) + + + +[[kernel.kallsyms]] (8,630,962 samples, 0.02%) + + + +[[kernel.kallsyms]] (25,315,610 samples, 0.05%) + + + +[[kernel.kallsyms]] (11,778,485 samples, 0.02%) + + + +[[kernel.kallsyms]] (10,757,457 samples, 0.02%) + + + +[[kernel.kallsyms]] (9,972,283 samples, 0.02%) + + + +[[kernel.kallsyms]] (6,540,584 samples, 0.01%) + + + +[[kernel.kallsyms]] (5,740,414 samples, 0.01%) + + + +_mid_memalign (10,813,731 samples, 0.02%) + + + +[[kernel.kallsyms]] (8,647,063 samples, 0.02%) + + + +dsacache::CacheData::WaitOnCompletion (27,883,144,753 samples, 49.61%) +dsacache::CacheData::WaitOnCompletion + + +[[kernel.kallsyms]] (5,109,987 samples, 0.01%) + + + +[[kernel.kallsyms]] (17,265,048 samples, 0.03%) + + + +dml::detail::ml::task<std::allocator<unsigned char> >::task (35,828,488 samples, 0.06%) + + + +[[kernel.kallsyms]] (25,484,485 samples, 0.05%) + + + +__GI_munmap (92,507,760 samples, 0.16%) + + + +[[kernel.kallsyms]] (93,967,068 samples, 0.17%) + + + +[[kernel.kallsyms]] (11,870,294 samples, 0.02%) + + + +[[kernel.kallsyms]] (9,919,156 samples, 0.02%) + + + +[[kernel.kallsyms]] (25,315,610 samples, 0.05%) + + + +[[kernel.kallsyms]] (5,997,398 samples, 0.01%) + + + +[[kernel.kallsyms]] (8,020,276 samples, 0.01%) + + + +[[kernel.kallsyms]] (10,757,457 samples, 0.02%) + + + +__GI___close_nocancel (7,344,388 samples, 0.01%) + + + +QDPBench (56,173,802,251 samples, 99.95%) +QDPBench + + +dml::core::dispatcher::hw_device::initialize_new_device (21,127,100 samples, 0.04%) + + + +[[kernel.kallsyms]] (5,740,414 samples, 0.01%) + + + +operator new (10,813,731 samples, 0.02%) + + + +unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (5,144,892,302 samples, 9.15%) +unsigned int .. + + +[[kernel.kallsyms]] (6,540,584 samples, 0.01%) + + + +[[kernel.kallsyms]] (8,400,747 samples, 0.01%) + + + +[[kernel.kallsyms]] (5,740,414 samples, 0.01%) + + + +[unknown] (1,234,157,360 samples, 2.20%) +[.. + + +[[kernel.kallsyms]] (6,357,006 samples, 0.01%) + + + +dml::detail::ml::impl::hardware::submit (25,560,478 samples, 0.05%) + + + +[[kernel.kallsyms]] (7,324,068,091 samples, 13.03%) +[[kernel.kallsyms]] + + +_IO_new_file_close_it (7,344,388 samples, 0.01%) + + + +LT<unsigned long>::simd_filter (26,906,506 samples, 0.05%) + + + +unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (1,015,021,080 samples, 1.81%) +u.. + + +__GI___mmap64 (93,967,068 samples, 0.17%) + + + +[[kernel.kallsyms]] (8,650,216 samples, 0.02%) + + + +[[kernel.kallsyms]] (258,139,562 samples, 0.46%) + + + +__libc_openat64 (6,835,870 samples, 0.01%) + + + +queue_stack (6,540,584 samples, 0.01%) + + + +[[kernel.kallsyms]] (11,040,629 samples, 0.02%) + + + +[[kernel.kallsyms]] (183,101,380 samples, 0.33%) + + + +__libc_open64 (17,566,875 samples, 0.03%) + + + +[[kernel.kallsyms]] (26,878,702 samples, 0.05%) + + + +[[kernel.kallsyms]] (5,302,335 samples, 0.01%) + + + +[[kernel.kallsyms]] (11,866,122 samples, 0.02%) + + + +std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (46,717,102 samples, 0.08%) + + + +_mm512_mask_add_epi64 (414,838,602 samples, 0.74%) + + + +std::pair<unsigned char* const, dsacache::CacheData>::~pair (27,978,244,770 samples, 49.78%) +std::pair<unsigned char* const, dsacache::CacheData>::~pair + + +[[kernel.kallsyms]] (33,581,259 samples, 0.06%) + + + +[[kernel.kallsyms]] (5,360,812 samples, 0.01%) + + + +[[kernel.kallsyms]] (32,112,996 samples, 0.06%) + + + +[[kernel.kallsyms]] (16,791,097 samples, 0.03%) + + + +[[kernel.kallsyms]] (6,799,118 samples, 0.01%) + + + +sudo (16,406,659 samples, 0.03%) + + + +std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::_M_deallocate_node (27,978,244,770 samples, 49.78%) +std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pa.. + + +wqs_init (16,212,211 samples, 0.03%) + + + +dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (35,828,488 samples, 0.06%) + + + +[[kernel.kallsyms]] (25,918,503 samples, 0.05%) + + + +[[kernel.kallsyms]] (38,027,005 samples, 0.07%) + + + +[[kernel.kallsyms]] (6,066,302 samples, 0.01%) + + + +_mid_memalign (52,769,643 samples, 0.09%) + + + +[[kernel.kallsyms]] (16,791,097 samples, 0.03%) + + + +operator new (52,769,643 samples, 0.09%) + + + +std::_Hashtable<unsigned char*, std::pair<unsigned char* const, dsacache::CacheData>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> >, std::__detail::_Select1st, std::equal_to<unsigned char*>, std::hash<unsigned char*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::clear (27,978,244,770 samples, 49.78%) +std::_Hashtable<unsigned char*, std::pair<unsigned char* const, dsacache::CacheD.. + + +[[kernel.kallsyms]] (11,870,294 samples, 0.02%) + + + +dsacache::CacheData::WaitOnCompletion (84,126,226 samples, 0.15%) + + + +dsacache::Cache::GetCacheNode (23,082,624 samples, 0.04%) + + + +[[kernel.kallsyms]] (93,216,145 samples, 0.17%) + + + +dsacache::Cache::SubmitTask (25,560,478 samples, 0.05%) + + + +void std::destroy_at<std::pair<unsigned char* const, dsacache::CacheData> > (27,978,244,770 samples, 49.78%) +void std::destroy_at<std::pair<unsigned char* const, dsacache::CacheData> > + + +__GI___libc_read (15,623,013 samples, 0.03%) + + + +[[kernel.kallsyms]] (89,913,865 samples, 0.16%) + + + +[[kernel.kallsyms]] (6,066,302 samples, 0.01%) + + + +Vector_Loader<unsigned long, (1,931,677,068 samples, 3.44%) +Vec.. + + +[[kernel.kallsyms]] (11,089,814 samples, 0.02%) + + + +__GI_madvise (11,778,485 samples, 0.02%) + + + +syscall (26,878,702 samples, 0.05%) + + + +[[kernel.kallsyms]] (16,071,115 samples, 0.03%) + + + +[[kernel.kallsyms]] (19,961,046 samples, 0.04%) + + + +_mid_memalign (46,717,102 samples, 0.08%) + + + +syscall (25,315,610 samples, 0.05%) + + + +[[kernel.kallsyms]] (9,972,283 samples, 0.02%) + + + +[[kernel.kallsyms]] (19,107,386 samples, 0.03%) + + + +[[kernel.kallsyms]] (13,838,204 samples, 0.02%) + + + +decltype (12,724,581 samples, 0.02%) + + + +std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (10,813,731 samples, 0.02%) + + + +[[kernel.kallsyms]] (13,514,538 samples, 0.02%) + + + +sum_check (1,371,232,114 samples, 2.44%) +su.. + + +[[kernel.kallsyms]] (9,972,283 samples, 0.02%) + + + +__GI___getdelim (22,226,508 samples, 0.04%) + + + +[[kernel.kallsyms]] (5,197,845 samples, 0.01%) + + + +[[kernel.kallsyms]] (27,648,058 samples, 0.05%) + + + +sysmalloc (10,813,731 samples, 0.02%) + + + +[[kernel.kallsyms]] (52,197,849 samples, 0.09%) + + + +[[kernel.kallsyms]] (5,740,414 samples, 0.01%) + + + +unsigned long std::uniform_int_distribution<unsigned long>::operator (7,679,240,842 samples, 13.66%) +unsigned long std::u.. + + +dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (10,813,731 samples, 0.02%) + + + +[[kernel.kallsyms]] (25,430,139 samples, 0.05%) + + + +std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (10,813,731 samples, 0.02%) + + + +[[kernel.kallsyms]] (6,347,448 samples, 0.01%) + + + +clone3 (6,606,257,445 samples, 11.75%) +clone3 + + +[[kernel.kallsyms]] (26,878,702 samples, 0.05%) + + + +__pthread_create_2_1 (12,724,581 samples, 0.02%) + + + +[[kernel.kallsyms]] (20,727,544 samples, 0.04%) + + + +dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::handler (35,828,488 samples, 0.06%) + + + +[[kernel.kallsyms]] (30,153,610 samples, 0.05%) + + + +[[kernel.kallsyms]] (56,894,061 samples, 0.10%) + + + +[[kernel.kallsyms]] (93,967,068 samples, 0.17%) + + + +auto dml::detail::ml::make_mem_move_task<std::allocator<unsigned char> > (46,717,102 samples, 0.08%) + + + +dml::core::dispatcher::hw_dispatcher::get_instance (25,315,981 samples, 0.05%) + + + +[[kernel.kallsyms]] (7,344,388 samples, 0.01%) + + + +__GI_mprotect (34,047,863 samples, 0.06%) + + + +[[kernel.kallsyms]] (9,972,283 samples, 0.02%) + + + +[[kernel.kallsyms]] (11,870,294 samples, 0.02%) + + + +__GI_mprotect (44,111,847 samples, 0.08%) + + + +numa_node_size64 (17,944,285 samples, 0.03%) + + + +[[kernel.kallsyms]] (6,835,870 samples, 0.01%) + + + +[[kernel.kallsyms]] (10,757,457 samples, 0.02%) + + + +scan_a (3,462,002,037 samples, 6.16%) +scan_a + + +[[kernel.kallsyms]] (11,778,485 samples, 0.02%) + + + +[[kernel.kallsyms]] (47,784,777 samples, 0.09%) + + + +[[kernel.kallsyms]] (11,243,448 samples, 0.02%) + + + +[[kernel.kallsyms]] (8,647,063 samples, 0.02%) + + + +std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (35,828,488 samples, 0.06%) + + + +[[kernel.kallsyms]] (9,861,578 samples, 0.02%) + + + +dsacache::Cache::ExecuteCopy (49,871,202 samples, 0.09%) + + + +grow_heap (50,154,117 samples, 0.09%) + + + +[[kernel.kallsyms]] (24,140,143 samples, 0.04%) + + + +[[kernel.kallsyms]] (8,647,063 samples, 0.02%) + + + +[[kernel.kallsyms]] (7,318,881,921 samples, 13.02%) +[[kernel.kallsyms]] + + +[[kernel.kallsyms]] (6,540,584 samples, 0.01%) + + + +std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (4,015,533,329 samples, 7.14%) +std::mers.. + + +[[kernel.kallsyms]] (10,376,330 samples, 0.02%) + + + +[[kernel.kallsyms]] (56,894,061 samples, 0.10%) + + + +[[kernel.kallsyms]] (9,861,578 samples, 0.02%) + + + +[libstdc++.so.6.0.32] (6,568,910,875 samples, 11.69%) +[libstdc++.so.6.0.. + + +[[kernel.kallsyms]] (5,197,845 samples, 0.01%) + + + +[[kernel.kallsyms]] (80,579,994 samples, 0.14%) + + + +[[kernel.kallsyms]] (11,778,485 samples, 0.02%) + + + +numa_alloc_onnode (59,682,970 samples, 0.11%) + + + +[[kernel.kallsyms]] (45,789,329 samples, 0.08%) + + + +[[kernel.kallsyms]] (50,154,117 samples, 0.09%) + + + +dml::core::dispatcher::hw_dispatcher::hw_dispatcher (23,965,836 samples, 0.04%) + + + +[[kernel.kallsyms]] (52,197,849 samples, 0.09%) + + + +_int_memalign (10,813,731 samples, 0.02%) + + + +__sysfs_device_parse (16,212,211 samples, 0.03%) + + + +[[kernel.kallsyms]] (92,507,760 samples, 0.16%) + + + +[[kernel.kallsyms]] (258,083,965 samples, 0.46%) + + + +[[kernel.kallsyms]] (5,333,154 samples, 0.01%) + + + +[[kernel.kallsyms]] (5,542,336 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,052,541 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,240,609,876 samples, 12.88%) +[[kernel.kallsyms]] + + +[[kernel.kallsyms]] (6,066,302 samples, 0.01%) + + + +dml::handler<dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >::allocator_type> dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (78,330,121 samples, 0.14%) + + + +std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (52,769,643 samples, 0.09%) + + + +[[kernel.kallsyms]] (11,870,294 samples, 0.02%) + + + +[[kernel.kallsyms]] (80,579,994 samples, 0.14%) + + + +[[kernel.kallsyms]] (13,726,772 samples, 0.02%) + + + +[[kernel.kallsyms]] (4,860,205 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,774,676 samples, 0.01%) + + + +[[kernel.kallsyms]] (33,198,896 samples, 0.06%) + + + +[[kernel.kallsyms]] (10,757,457 samples, 0.02%) + + + +[[kernel.kallsyms]] (31,728,822 samples, 0.06%) + + + +[[kernel.kallsyms]] (15,623,013 samples, 0.03%) + + + +[[kernel.kallsyms]] (82,999,130 samples, 0.15%) + + + +[[kernel.kallsyms]] (9,972,283 samples, 0.02%) + + + +[[kernel.kallsyms]] (4,837,589 samples, 0.01%) + + + +[[kernel.kallsyms]] (11,870,294 samples, 0.02%) + + + +[[kernel.kallsyms]] (25,484,485 samples, 0.05%) + + + +dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::handler (10,813,731 samples, 0.02%) + + + +[[kernel.kallsyms]] (52,197,849 samples, 0.09%) + + + +__GI___getdelim (7,608,450 samples, 0.01%) + + + +dsacache::Cache::AllocOnNode (77,976,052 samples, 0.14%) + + + +dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::get (27,883,144,753 samples, 49.61%) +dml::handler<dml::mem_copy_operation, std::allocator<unsigned char> >::get + + +[[kernel.kallsyms]] (13,547,223 samples, 0.02%) + + + +[[kernel.kallsyms]] (256,851,116 samples, 0.46%) + + + +[[kernel.kallsyms]] (89,913,865 samples, 0.16%) + + + +void std::allocator_traits<std::allocator<std::thread> >::construct<std::thread, void (12,724,581 samples, 0.02%) + + + +std::thread::join (6,839,397 samples, 0.01%) + + + +[[kernel.kallsyms]] (19,162,146 samples, 0.03%) + + + +[[kernel.kallsyms]] (15,623,013 samples, 0.03%) + + + +__pthread_clockjoin_ex (6,839,397 samples, 0.01%) + + + +_mm512_stream_load_si512 (2,993,062,044 samples, 5.33%) +_mm512.. + + +[[kernel.kallsyms]] (12,973,510 samples, 0.02%) + + + +dml::detail::ml::task<std::allocator<unsigned char> >::task (52,769,643 samples, 0.09%) + + + +[[kernel.kallsyms]] (90,779,509 samples, 0.16%) + + + +[[kernel.kallsyms]] (14,066,357 samples, 0.03%) + + + +[[kernel.kallsyms]] (15,623,013 samples, 0.03%) + + + +[[kernel.kallsyms]] (25,315,610 samples, 0.05%) + + + +[[kernel.kallsyms]] (11,870,294 samples, 0.02%) + + + +sh (5,618,491 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,000,961,265 samples, 10.68%) +[[kernel.kallsy.. + + +[[kernel.kallsyms]] (6,835,870 samples, 0.01%) + + + +_IO_new_file_underflow (19,746,317 samples, 0.04%) + + + +[[kernel.kallsyms]] (15,896,933 samples, 0.03%) + + + +[[kernel.kallsyms]] (43,636,433 samples, 0.08%) + + + +[[kernel.kallsyms]] (56,894,061 samples, 0.10%) + + + +dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (46,717,102 samples, 0.08%) + + + +[[kernel.kallsyms]] (52,197,849 samples, 0.09%) + + + +[[kernel.kallsyms]] (23,722,633 samples, 0.04%) + + + +[[kernel.kallsyms]] (8,649,622 samples, 0.02%) + + + +all (56,200,887,845 samples, 100%) + + + +devices_init (6,615,510 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,052,541 samples, 0.01%) + + + +[[kernel.kallsyms]] (6,835,870 samples, 0.01%) + + + +[[kernel.kallsyms]] (44,111,847 samples, 0.08%) + + + +[[kernel.kallsyms]] (56,894,061 samples, 0.10%) + + + +[[kernel.kallsyms]] (31,692,824 samples, 0.06%) + + + +[[kernel.kallsyms]] (11,169,708 samples, 0.02%) + + + +std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (35,828,488 samples, 0.06%) + + + +[[kernel.kallsyms]] (6,066,302 samples, 0.01%) + + + +[[kernel.kallsyms]] (5,721,684 samples, 0.01%) + + + +[[kernel.kallsyms]] (7,311,966,115 samples, 13.01%) +[[kernel.kallsyms]] + + +[[kernel.kallsyms]] (92,507,760 samples, 0.16%) + + + +[[kernel.kallsyms]] (36,196,373 samples, 0.06%) + + + +[[kernel.kallsyms]] (15,896,933 samples, 0.03%) + + + +[[kernel.kallsyms]] (56,894,061 samples, 0.10%) + + + +[[kernel.kallsyms]] (259,363,157 samples, 0.46%) + + + +[[kernel.kallsyms]] (19,185,252 samples, 0.03%) + + + +[[kernel.kallsyms]] (9,972,283 samples, 0.02%) + + + +dsacache::Cache::SubmitTask (167,119,904 samples, 0.30%) + + + +dsacache::Cache::SubmitTask (291,889,749 samples, 0.52%) + + + +[anon] (52,831,798 samples, 0.09%) + + + +[[kernel.kallsyms]] (6,774,676 samples, 0.01%) + + + +[[kernel.kallsyms]] (26,878,702 samples, 0.05%) + + + +__GI___mmap64 (32,804,268 samples, 0.06%) + + + +[[kernel.kallsyms]] (8,424,524 samples, 0.01%) + + + +std::thread& std::vector<std::thread, std::allocator<std::thread> >::emplace_back<void (13,581,055 samples, 0.02%) + + + +[[kernel.kallsyms]] (7,324,068,091 samples, 13.03%) +[[kernel.kallsyms]] + + +[[kernel.kallsyms]] (13,838,204 samples, 0.02%) + + + +[[kernel.kallsyms]] (5,188,239 samples, 0.01%) + + + +[[kernel.kallsyms]] (13,025,872 samples, 0.02%) + + + +[[kernel.kallsyms]] (22,884,622 samples, 0.04%) + + + +