diff --git a/qdp_project/CMakeLists.txt b/qdp_project/CMakeLists.txt index 29385fa..d90d091 100644 --- a/qdp_project/CMakeLists.txt +++ b/qdp_project/CMakeLists.txt @@ -19,11 +19,13 @@ set(SUPPRESS_WARNINGS "-Wno-literal-suffix -Wno-volatile") set(DEBUG_FLAGS "-g3" "-ggdb") set(RELEASE_FLAGS "-O3") +set(RELWITHDEBINFO_FLAGS "-O2" "-ggdb3" "-fno-omit-frame-pointer") #set flags used for Release and Debug build type add_compile_options( "$<$:${RELEASE_FLAGS}>" "$<$:${DEBUG_FLAGS}>" + "$<$:${RELWITHDEBINFO_FLAGS}>" ) # include directories diff --git a/qdp_project/evaluation-results/perf.svg b/qdp_project/evaluation-results/perf.svg index afa37ef..3b1d655 100644 --- a/qdp_project/evaluation-results/perf.svg +++ b/qdp_project/evaluation-results/perf.svg @@ -1,6 +1,6 @@ - + @@ -421,1569 +421,1201 @@ } ]]> - + Flame Graph - + Reset Zoom Search ic - + -[[kernel.kallsyms]] (1,094,452 samples, 0.01%) - +futex_wait_queue (1,739,970 samples, 0.01%) + -[[kernel.kallsyms]] (1,735,727 samples, 0.01%) - +clear_huge_page (327,031,017 samples, 2.43%) +cl.. -[[kernel.kallsyms]] (18,068,150 samples, 0.14%) - +__mmu_notifier_invalidate_range_end (4,315,626 samples, 0.03%) + -[[kernel.kallsyms]] (1,662,976 samples, 0.01%) - +std::_Hashtable<unsigned char*, std::pair<unsigned char* const, dsacache::CacheData>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> >, std::__detail::_Select1st, std::equal_to<unsigned char*>, std::hash<unsigned char*>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<false, false, true> >::clear (5,177,974 samples, 0.04%) + -[[kernel.kallsyms]] (18,810,377 samples, 0.15%) - +perf_iterate_ctx (1,724,604 samples, 0.01%) + -[[kernel.kallsyms]] (1,737,656 samples, 0.01%) - +void caching<1ul> (40,431,525 samples, 0.30%) + -Vector_Loader<unsigned long, (279,005,046 samples, 2.18%) -V.. +__mmu_notifier_invalidate_range_end (6,744,844 samples, 0.05%) + -dl_open_worker_begin (1,262,425 samples, 0.01%) - +_mid_memalign (25,295,540 samples, 0.19%) + -[[kernel.kallsyms]] (1,562,656 samples, 0.01%) - +std::__tree_barrier<NopStruct>::wait (1,739,970 samples, 0.01%) + -[[kernel.kallsyms]] (5,959,122 samples, 0.05%) - +__GI_munmap (1,679,595 samples, 0.01%) + -[[kernel.kallsyms]] (1,735,727 samples, 0.01%) - +_int_memalign (25,295,540 samples, 0.19%) + -[[kernel.kallsyms]] (1,737,164 samples, 0.01%) - +std::chrono::_V2::steady_clock::now (1,691,962,679 samples, 12.55%) +std::chrono::_V2::.. -[[kernel.kallsyms]] (1,832,823,392 samples, 14.33%) -[[kernel.kallsyms]] +exc_page_fault (1,819,294,773 samples, 13.50%) +exc_page_fault -[[kernel.kallsyms]] (1,106,913 samples, 0.01%) - +free_unref_page (12,087,054 samples, 0.09%) + -__GI___clock_gettime (141,009,347 samples, 1.10%) - +dml::core::dispatcher::hw_dispatcher::~hw_dispatcher (1,679,595 samples, 0.01%) + -_int_malloc (2,585,739 samples, 0.02%) - +__sysvec_apic_timer_interrupt (2,568,736 samples, 0.02%) + -[[kernel.kallsyms]] (25,788,811 samples, 0.20%) - +perf_event_mmap (1,724,604 samples, 0.01%) + -_dl_catch_error (1,262,425 samples, 0.01%) - +advise_stack_range (1,739,797 samples, 0.01%) + -clock_gettime@plt (24,322,123 samples, 0.19%) - +perf_adjust_freq_unthr_context (4,299,293 samples, 0.03%) + -[[kernel.kallsyms]] (1,143,574 samples, 0.01%) - +qi_flush_dev_iotlb_pasid (1,726,916 samples, 0.01%) + -devices_init (5,339,194 samples, 0.04%) - +unsigned long std::uniform_int_distribution<unsigned long>::operator (2,642,523,934 samples, 19.61%) +unsigned long std::uniform_int.. -[[kernel.kallsyms]] (18,068,150 samples, 0.14%) - +grow_heap (25,295,540 samples, 0.19%) + -std::chrono::_V2::steady_clock::now (84,598,147 samples, 0.66%) - +std::__detail::__waiter_pool::_M_do_wait (1,739,970 samples, 0.01%) + -_IO_new_file_underflow (3,448,158 samples, 0.03%) - +qi_submit_sync (2,588,710 samples, 0.02%) + -[[kernel.kallsyms]] (1,735,727 samples, 0.01%) - +handle_mm_fault (6,022,765 samples, 0.04%) + -[[kernel.kallsyms]] (2,585,739 samples, 0.02%) - +__folio_alloc (2,574,263 samples, 0.02%) + -dsacache::Cache::Access (1,735,833 samples, 0.01%) - +dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (25,295,540 samples, 0.19%) + -dml::core::dispatcher::hw_dispatcher::get_instance (11,911,549 samples, 0.09%) - +std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (959,605,276 samples, 7.12%) +std::mers.. -std::barrier<NopStruct>::wait (2,605,852 samples, 0.02%) - +do_huge_pmd_anonymous_page (4,311,772 samples, 0.03%) + -dsacache::Cache::AllocOnNode (5,157,234 samples, 0.04%) - +grow_heap (6,744,844 samples, 0.05%) + -dml::detail::ml::impl::hardware::submit (11,911,549 samples, 0.09%) - +Filter<unsigned long, LT, (5,411,903,209 samples, 40.15%) +Filter<unsigned long, LT, -[unknown] (479,146,399 samples, 3.75%) -[unk.. +vma_alloc_folio (2,574,263 samples, 0.02%) + -[[kernel.kallsyms]] (1,094,452 samples, 0.01%) - +__mmu_notifier_invalidate_range_end (23,570,936 samples, 0.17%) + -scan_b (46,025,734 samples, 0.36%) - +__GI_munmap (17,268,198 samples, 0.13%) + -[[kernel.kallsyms]] (2,589,863 samples, 0.02%) - +entry_SYSCALL_64_after_hwframe (2,824,555 samples, 0.02%) + -dml::handler<dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >::allocator_type> dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (40,290,223 samples, 0.32%) - +sysvec_apic_timer_interrupt (6,036,217 samples, 0.04%) + -read (1,775,776 samples, 0.01%) - +dml::handler<dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >::allocator_type> dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (31,098,656 samples, 0.23%) + -[[kernel.kallsyms]] (10,426,718 samples, 0.08%) - +syscall (1,739,970 samples, 0.01%) + -[[kernel.kallsyms]] (10,426,718 samples, 0.08%) - +_raw_spin_lock_irqsave (1,574,388 samples, 0.01%) + -[[kernel.kallsyms]] (15,448,011 samples, 0.12%) - +do_syscall_64 (2,824,555 samples, 0.02%) + -[[kernel.kallsyms]] (3,452,194 samples, 0.03%) - +auto dml::detail::submit<dml::hardware, dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >, dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (31,098,656 samples, 0.23%) + -[[kernel.kallsyms]] (1,273,668,574 samples, 9.96%) -[[kernel.kalls.. +dsacache::Cache::Access (40,431,525 samples, 0.30%) + -__GI__IO_doallocbuf (2,585,739 samples, 0.02%) - +unsigned long std::uniform_int_distribution<unsigned long>::operator (2,636,449,788 samples, 19.56%) +unsigned long std::uniform_int.. -[[kernel.kallsyms]] (2,587,208 samples, 0.02%) - +asm_exc_page_fault (1,821,899,562 samples, 13.52%) +asm_exc_page_fault -Vector_Loader<unsigned long, (201,075,858 samples, 1.57%) - +__GI__IO_file_doallocate (7,607,464 samples, 0.06%) + -[[kernel.kallsyms]] (1,094,452 samples, 0.01%) - +sync_regs (2,604,789 samples, 0.02%) + -[[kernel.kallsyms]] (25,788,811 samples, 0.20%) - +qi_flush_piotlb (10,915,702 samples, 0.08%) + -[[kernel.kallsyms]] (6,243,993 samples, 0.05%) - +qi_submit_sync (1,726,916 samples, 0.01%) + -[[kernel.kallsyms]] (6,243,993 samples, 0.05%) - +std::barrier<NopStruct>::arrive_and_wait (1,739,970 samples, 0.01%) + -add_wq (8,897,243 samples, 0.07%) - +try_charge_memcg (1,735,920 samples, 0.01%) + -[[kernel.kallsyms]] (1,662,976 samples, 0.01%) - +dsacache::Cache::AllocOnNode (9,332,869 samples, 0.07%) + -[[kernel.kallsyms]] (1,739,978 samples, 0.01%) - +perf_event_task_tick (20,760,710 samples, 0.15%) + -QDPBench (12,788,400,850 samples, 100.00%) -QDPBench +perf_event_task_tick (4,299,293 samples, 0.03%) + -[[kernel.kallsyms]] (1,106,913 samples, 0.01%) - +__x64_sys_munmap (1,679,595 samples, 0.01%) + -[[kernel.kallsyms]] (1,832,823,392 samples, 14.33%) -[[kernel.kallsyms]] +copy_process (3,468,037 samples, 0.03%) + -std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::_M_gen_rand (407,167,062 samples, 3.18%) -std.. +std::common_type<std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >::type std::chrono::operator-<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > (19,094,643 samples, 0.14%) + -[[kernel.kallsyms]] (6,243,993 samples, 0.05%) - +__GI_madvise (1,739,797 samples, 0.01%) + -void std::__detail::__platform_wait<int> (1,737,280 samples, 0.01%) - +unmap_region (4,315,626 samples, 0.03%) + -unsigned long std::uniform_int_distribution<unsigned long>::operator (1,944,872,025 samples, 15.21%) -unsigned long std::unif.. +dml::core::dispatcher::hw_dispatcher::hw_dispatcher (4,958,906 samples, 0.04%) + -[[kernel.kallsyms]] (1,830,230,761 samples, 14.31%) -[[kernel.kallsyms]] +zap_huge_pmd (2,590,090 samples, 0.02%) + -sysmalloc (2,585,739 samples, 0.02%) - +std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::_M_deallocate_nodes (5,177,974 samples, 0.04%) + -[[kernel.kallsyms]] (1,662,976 samples, 0.01%) - +void std::__detail::__waiter<std::integral_constant<bool, true> >::_M_do_wait<std::__tree_barrier<NopStruct>::wait (1,739,970 samples, 0.01%) + -[[kernel.kallsyms]] (1,262,425 samples, 0.01%) - +std::chrono::_V2::steady_clock::now (1,714,183,943 samples, 12.72%) +std::chrono::_V2::s.. -[[kernel.kallsyms]] (17,941,270 samples, 0.14%) - +dsacache::Cache::SubmitTask (40,431,525 samples, 0.30%) + -sysmalloc (28,378,674 samples, 0.22%) - +__GI__IO_doallocbuf (7,607,464 samples, 0.06%) + -[[kernel.kallsyms]] (1,106,913 samples, 0.01%) - +tick_sched_timer (5,167,785 samples, 0.04%) + -[[kernel.kallsyms]] (1,737,164 samples, 0.01%) - +__memcg_kmem_charge_page (2,603,722 samples, 0.02%) + -[[kernel.kallsyms]] (5,089,546 samples, 0.04%) - +perf_iterate_sb.constprop.0 (1,724,604 samples, 0.01%) + -groups_init (3,101,168 samples, 0.02%) - +entry_SYSCALL_64_after_hwframe (1,739,970 samples, 0.01%) + -[[kernel.kallsyms]] (10,426,718 samples, 0.08%) - +__list_del_entry_valid (1,735,652 samples, 0.01%) + -std::__detail::__waiter_pool::_M_do_wait (1,735,727 samples, 0.01%) - +do_syscall_64 (1,739,970 samples, 0.01%) + -dl_main (2,239,586 samples, 0.02%) - +scheduler_tick (1,704,757 samples, 0.01%) + -[[kernel.kallsyms]] (1,724,889 samples, 0.01%) - +perf_event_task_tick (1,704,757 samples, 0.01%) + -[[kernel.kallsyms]] (1,724,889 samples, 0.01%) - +__GI___getdelim (8,469,848 samples, 0.06%) + -[[kernel.kallsyms]] (18,068,150 samples, 0.14%) - +do_huge_pmd_anonymous_page (1,816,689,561 samples, 13.48%) +do_huge_pmd_anonymou.. -device_parse (3,101,168 samples, 0.02%) - +dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (25,295,540 samples, 0.19%) + -[[stack]] (319,807,366 samples, 2.50%) -[[.. +perf_event_init_task (2,599,737 samples, 0.02%) + -std::thread::_M_start_thread (2,587,208 samples, 0.02%) - +qi_submit_sync (1,679,595 samples, 0.01%) + -[[kernel.kallsyms]] (3,733,433 samples, 0.03%) - +__GI_exit (1,679,595 samples, 0.01%) + -__sysfs_device_parse (3,101,168 samples, 0.02%) - +tick_sched_handle (20,760,710 samples, 0.15%) + -[[kernel.kallsyms]] (17,084,810 samples, 0.13%) - +[[vdso]] (1,255,679,649 samples, 9.32%) +[[vdso]] -[[kernel.kallsyms]] (1,737,164 samples, 0.01%) - +update_process_times (2,568,736 samples, 0.02%) + -[[kernel.kallsyms]] (5,089,546 samples, 0.04%) - +asm_sysvec_apic_timer_interrupt (2,568,736 samples, 0.02%) + -[[kernel.kallsyms]] (2,585,739 samples, 0.02%) - +do_syscall_64 (3,468,037 samples, 0.03%) + -__libc_start_main_impl (4,778,433,373 samples, 37.37%) -__libc_start_main_impl +get_page_from_freelist (2,574,263 samples, 0.02%) + -[[kernel.kallsyms]] (1,106,913 samples, 0.01%) - +clear_page_erms (1,737,509 samples, 0.01%) + -syscall (1,735,727 samples, 0.01%) - +do_vmi_align_munmap (1,679,595 samples, 0.01%) + -_mm512_cmplt_epi64_mask (3,470,584 samples, 0.03%) - +do_user_addr_fault (6,888,173 samples, 0.05%) + -[[kernel.kallsyms]] (1,094,452 samples, 0.01%) - +std::common_type<std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000000000l>, long, std::ratio<1l, 1000000000l> > (19,094,643 samples, 0.14%) + -advise_stack_range (1,737,656 samples, 0.01%) - +sysvec_apic_timer_interrupt (20,760,710 samples, 0.15%) + -[[kernel.kallsyms]] (25,788,811 samples, 0.20%) - +unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (1,677,102,236 samples, 12.44%) +unsigned int std::.. -void std::__detail::__waiter<std::integral_constant<bool, true> >::_M_do_wait<std::__tree_barrier<NopStruct>::wait (3,474,594 samples, 0.03%) - +tick_sched_timer (2,573,741 samples, 0.02%) + -[[kernel.kallsyms]] (1,143,574 samples, 0.01%) - +entry_SYSCALL_64_after_hwframe (1,739,797 samples, 0.01%) + -dsacache::Cache::SubmitTask (11,911,549 samples, 0.09%) - +do_filp_open (2,824,555 samples, 0.02%) + -[[kernel.kallsyms]] (24,927,309 samples, 0.19%) - +unmap_region (17,268,198 samples, 0.13%) + -__GI_mprotect (25,788,811 samples, 0.20%) - +std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (25,295,540 samples, 0.19%) + -Filter<unsigned long, LT, (3,282,302,648 samples, 25.67%) -Filter<unsigned long, LT, +change_protection (23,570,936 samples, 0.17%) + -_int_memalign (28,378,674 samples, 0.22%) - +intel_invalidate_range (4,315,626 samples, 0.03%) + -unsigned long std::uniform_int_distribution<unsigned long>::operator (2,210,412,779 samples, 17.28%) -unsigned long std::uniform.. +free_tail_page_prepare (9,496,754 samples, 0.07%) + -[[kernel.kallsyms]] (2,589,863 samples, 0.02%) - +asm_exc_page_fault (6,888,173 samples, 0.05%) + -dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (28,378,674 samples, 0.22%) - +folio_add_lru (1,574,388 samples, 0.01%) + -dsacache::Cache::GetCacheNode (1,735,833 samples, 0.01%) - +update_process_times (5,167,785 samples, 0.04%) + -[[kernel.kallsyms]] (25,788,811 samples, 0.20%) - +__run_exit_handlers (1,679,595 samples, 0.01%) + -[[kernel.kallsyms]] (1,143,574 samples, 0.01%) - +Sum<unsigned long>::simd_agg (3,349,026 samples, 0.02%) + -void std::__atomic_wait_address<std::__barrier_phase_t, std::__tree_barrier<NopStruct>::wait (3,474,594 samples, 0.03%) - +schedule (1,739,970 samples, 0.01%) + -[[kernel.kallsyms]] (1,508,809,254 samples, 11.80%) -[[kernel.kallsyms]] +qi_flush_piotlb (2,588,710 samples, 0.02%) + -[[kernel.kallsyms]] (6,948,430 samples, 0.05%) - +[[vdso]] (1,207,632,714 samples, 8.96%) +[[vdso]] -[[kernel.kallsyms]] (1,148,662 samples, 0.01%) - +dsacache::Cache::ExecuteCopy (31,098,656 samples, 0.23%) + -[[kernel.kallsyms]] (1,262,425 samples, 0.01%) - +__libc_openat64 (2,824,555 samples, 0.02%) + -_dl_load_cache_lookup (1,262,425 samples, 0.01%) - +hrtimer_interrupt (2,573,741 samples, 0.02%) + -auto dml::detail::submit<dml::hardware, dml::mem_copy_operation, dml::execution_interface<dml::hardware, std::allocator<unsigned char> >, dml::submit<dml::hardware, dml::execution_interface<dml::hardware, std::allocator<unsigned char> > > (40,290,223 samples, 0.32%) - +__GI__IO_doallocbuf (7,607,464 samples, 0.06%) + -[[kernel.kallsyms]] (18,068,150 samples, 0.14%) - +clone3 (7,578,221,982 samples, 56.23%) +clone3 -[[kernel.kallsyms]] (1,662,976 samples, 0.01%) - +mprotect_fixup (25,295,540 samples, 0.19%) + -[[kernel.kallsyms]] (1,737,656 samples, 0.01%) - +__schedule (1,739,970 samples, 0.01%) + -syscall (1,737,164 samples, 0.01%) - +__vm_munmap (1,679,595 samples, 0.01%) + -[[kernel.kallsyms]] (17,941,270 samples, 0.14%) - +entry_SYSCALL_64_after_hwframe (1,679,595 samples, 0.01%) + -[[kernel.kallsyms]] (25,788,811 samples, 0.20%) - +__x64_sys_munmap (17,268,198 samples, 0.13%) + -_dl_sysdep_read_whole_file (1,262,425 samples, 0.01%) - +QDPBench (13,478,052,093 samples, 100.00%) +QDPBench -[[kernel.kallsyms]] (1,725,071 samples, 0.01%) - +__hrtimer_run_queues (2,573,741 samples, 0.02%) + -[[kernel.kallsyms]] (1,143,574 samples, 0.01%) - +__sysvec_apic_timer_interrupt (3,442,758 samples, 0.03%) + -[[kernel.kallsyms]] (1,262,425 samples, 0.01%) - +std::pair<unsigned char* const, dsacache::CacheData>::~pair (5,177,974 samples, 0.04%) + -[[kernel.kallsyms]] (4,158,226 samples, 0.03%) - +__do_sys_clone3 (3,468,037 samples, 0.03%) + -__GI___libc_read (1,775,776 samples, 0.01%) - +__alloc_pages (2,574,263 samples, 0.02%) + -operator new (28,378,674 samples, 0.22%) - +void std::__detail::__platform_wait<int> (1,739,970 samples, 0.01%) + -std::common_type<std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000000000l>, long, std::ratio<1l, 1000000000l> > (3,498,307 samples, 0.03%) - +do_vmi_munmap (5,177,974 samples, 0.04%) + -unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (1,295,771,478 samples, 10.13%) -unsigned int s.. +__hrtimer_run_queues (5,167,785 samples, 0.04%) + -[[kernel.kallsyms]] (2,585,739 samples, 0.02%) - +get_page_from_freelist (1,474,202,521 samples, 10.94%) +get_page_from_fr.. -[[kernel.kallsyms]] (1,735,727 samples, 0.01%) - +do_syscall_64 (6,744,844 samples, 0.05%) + -[[kernel.kallsyms]] (5,959,122 samples, 0.05%) - +unmap_page_range (2,590,090 samples, 0.02%) + -syscall (1,737,280 samples, 0.01%) - +do_anonymous_page (1,710,993 samples, 0.01%) + -[[kernel.kallsyms]] (18,068,150 samples, 0.14%) - +kernfs_fop_open (1,241,282 samples, 0.01%) + -[[kernel.kallsyms]] (10,320,932 samples, 0.08%) - +folio_batch_move_lru (1,574,388 samples, 0.01%) + -void caching<1ul> (45,447,457 samples, 0.36%) - +unmap_region (1,679,595 samples, 0.01%) + -__libc_start_call_main (4,778,433,373 samples, 37.37%) -__libc_start_call_main +clear_page_erms (2,574,263 samples, 0.02%) + -[[kernel.kallsyms]] (18,068,150 samples, 0.14%) - +asm_sysvec_apic_timer_interrupt (6,034,647 samples, 0.04%) + -[[kernel.kallsyms]] (18,068,150 samples, 0.14%) - +_start (5,290,426,542 samples, 39.25%) +_start -[[kernel.kallsyms]] (1,737,280 samples, 0.01%) - +do_sys_openat2 (2,824,555 samples, 0.02%) + -__GI_mprotect (2,585,739 samples, 0.02%) - +do_vmi_munmap (1,679,595 samples, 0.01%) + -std::__detail::__waiter_base<std::__detail::__waiter_pool>::_M_notify (1,737,164 samples, 0.01%) - +dml::core::dispatcher::hw_dispatcher::get_instance (4,958,907 samples, 0.04%) + -[[kernel.kallsyms]] (1,148,662 samples, 0.01%) - +__sysvec_apic_timer_interrupt (20,760,710 samples, 0.15%) + -[[kernel.kallsyms]] (1,724,889 samples, 0.01%) - +perf_adjust_freq_unthr_context (20,760,710 samples, 0.15%) + -__GI___getdelim (3,448,158 samples, 0.03%) - +sysmalloc (7,607,464 samples, 0.06%) + -__GI_munmap (10,426,718 samples, 0.08%) - +qi_flush_dev_iotlb_pasid (4,174,802 samples, 0.03%) + -std::__atomic_ref<std::__barrier_phase_t, false, false>::notify_all (1,737,164 samples, 0.01%) - +clock_gettime@plt (33,987,260 samples, 0.25%) + -[[kernel.kallsyms]] (1,106,913 samples, 0.01%) - +add_wq (4,519,291 samples, 0.03%) + -[[kernel.kallsyms]] (10,426,718 samples, 0.08%) - +folio_lruvec_lock_irqsave (1,574,388 samples, 0.01%) + -[[kernel.kallsyms]] (18,068,150 samples, 0.14%) - +__GI___libc_malloc (7,607,464 samples, 0.06%) + -dml::core::dispatcher::hw_dispatcher::initialize_hw (11,911,549 samples, 0.09%) - +do_dentry_open (1,241,282 samples, 0.01%) + -[[kernel.kallsyms]] (2,585,739 samples, 0.02%) - +unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (608,239,162 samples, 4.51%) +unsig.. -[[kernel.kallsyms]] (17,207,204 samples, 0.13%) - +dsacache::CacheData::~CacheData (5,177,974 samples, 0.04%) + -dlopen_doit (1,262,425 samples, 0.01%) - +__handle_mm_fault (6,022,765 samples, 0.04%) + -[[kernel.kallsyms]] (2,587,208 samples, 0.02%) - +update_process_times (20,760,710 samples, 0.15%) + -void std::__detail::__waiter<std::integral_constant<bool, true> >::_M_do_wait<std::__tree_barrier<NopStruct>::wait (2,605,852 samples, 0.02%) - +sysmalloc (25,295,540 samples, 0.19%) + -std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (12,962,126 samples, 0.10%) - +__x64_sys_munmap (5,177,974 samples, 0.04%) + -[[kernel.kallsyms]] (1,737,280 samples, 0.01%) - +unmap_vmas (4,315,626 samples, 0.03%) + -[[kernel.kallsyms]] (1,832,823,392 samples, 14.33%) -[[kernel.kallsyms]] +intel_invalidate_range (1,679,595 samples, 0.01%) + -__GI_sched_yield (1,737,314 samples, 0.01%) - +scan_b (40,431,526 samples, 0.30%) + -LT<unsigned long>::simd_filter (3,470,584 samples, 0.03%) - +unmap_vmas (1,679,595 samples, 0.01%) + -[[kernel.kallsyms]] (1,737,656 samples, 0.01%) - +entry_SYSCALL_64_after_hwframe (5,177,974 samples, 0.04%) + -accfg_get_param_long (2,238,026 samples, 0.02%) - +accfg_get_param_long (3,788,940 samples, 0.03%) + -void std::__detail::__platform_notify<int> (1,737,164 samples, 0.01%) - +std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (25,295,540 samples, 0.19%) + -__sysfs_read_attr (1,751,881 samples, 0.01%) - +unsigned long std::uniform_int_distribution<unsigned long>::operator (608,239,162 samples, 4.51%) +unsig.. -[[kernel.kallsyms]] (2,589,863 samples, 0.02%) - +unmap_vmas (2,590,090 samples, 0.02%) + -std::thread::thread<void (2,587,208 samples, 0.02%) - +std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (25,295,540 samples, 0.19%) + -scan_a (3,288,832,947 samples, 25.72%) -scan_a +intel_invalidate_range (6,744,844 samples, 0.05%) + -__GI__dl_catch_exception (1,262,425 samples, 0.01%) - +sysvec_apic_timer_interrupt (2,568,736 samples, 0.02%) + -dml::core::hardware_device::submit (11,911,549 samples, 0.09%) - +dml::core::hardware_device::submit (5,803,116 samples, 0.04%) + -clone3 (3,654,081,491 samples, 28.57%) -clone3 +dml::detail::ml::task<std::allocator<unsigned char> >::task (25,295,540 samples, 0.19%) + -[[kernel.kallsyms]] (1,562,656 samples, 0.01%) - +dsacache::CacheData::WaitOnCompletion (5,177,974 samples, 0.04%) + -openat (1,143,574 samples, 0.01%) - +do_syscall_64 (5,177,974 samples, 0.04%) + -dml::detail::ml::task<std::allocator<unsigned char> >::task (28,378,674 samples, 0.22%) - +do_futex (1,739,970 samples, 0.01%) + -dml::core::dispatcher::hw_dispatcher::hw_dispatcher (11,911,549 samples, 0.09%) - +inherit_task_group.isra.0 (2,599,737 samples, 0.02%) + -[[kernel.kallsyms]] (1,737,280 samples, 0.01%) - +qi_submit_sync (4,174,802 samples, 0.03%) + -Sum<unsigned long>::simd_agg (18,513,417 samples, 0.14%) - +perf_event_alloc (1,734,813 samples, 0.01%) + -std::barrier<NopStruct>::arrive_and_wait (6,080,084 samples, 0.05%) - +__x64_sys_futex (1,739,970 samples, 0.01%) + -[[kernel.kallsyms]] (1,830,230,761 samples, 14.31%) -[[kernel.kallsyms]] +_int_malloc (7,607,464 samples, 0.06%) + -[[kernel.kallsyms]] (17,941,270 samples, 0.14%) - +do_user_addr_fault (1,819,294,773 samples, 13.50%) +do_user_addr_fault -[[kernel.kallsyms]] (1,143,574 samples, 0.01%) - +__rmqueue_pcplist (8,678,276 samples, 0.06%) + -clock_gettime@plt (13,672,702 samples, 0.11%) - +__GI___clock_gettime (1,631,208,158 samples, 12.10%) +__GI___clock_gettime -std::barrier<NopStruct>::arrive_and_wait (2,605,852 samples, 0.02%) - +__vdso_clock_gettime (1,595,028,674 samples, 11.83%) +__vdso_clock_gett.. -sum_check (352,075,561 samples, 2.75%) -su.. +entry_SYSCALL_64_after_hwframe (17,268,198 samples, 0.13%) + -[[kernel.kallsyms]] (25,788,811 samples, 0.20%) - +__sysvec_apic_timer_interrupt (6,036,217 samples, 0.04%) + -std::barrier<NopStruct>::arrive (2,605,490 samples, 0.02%) - +dsacache::Cache::SubmitTask (5,803,116 samples, 0.04%) + -[[kernel.kallsyms]] (1,262,425 samples, 0.01%) - +hrtimer_interrupt (20,760,710 samples, 0.15%) + -[[kernel.kallsyms]] (17,941,270 samples, 0.14%) - +_mm512_mask_add_epi64 (3,349,026 samples, 0.02%) + -[[kernel.kallsyms]] (1,737,656 samples, 0.01%) - +__handle_mm_fault (1,818,426,587 samples, 13.49%) +__handle_mm_fault -[[kernel.kallsyms]] (5,959,122 samples, 0.05%) - +__x64_sys_openat (2,824,555 samples, 0.02%) + -[[kernel.kallsyms]] (2,587,208 samples, 0.02%) - +__libc_start_main_impl (5,290,426,542 samples, 39.25%) +__libc_start_main_impl -dlopen_implementation (1,262,425 samples, 0.01%) - +scheduler_tick (4,299,293 samples, 0.03%) + -[[kernel.kallsyms]] (1,086,257 samples, 0.01%) - +do_vmi_align_munmap (5,177,974 samples, 0.04%) + -_mid_memalign (28,378,674 samples, 0.22%) - +tick_sched_handle (5,167,785 samples, 0.04%) + -device_parse (8,897,243 samples, 0.07%) - +asm_sysvec_apic_timer_interrupt (6,893,223 samples, 0.05%) + -[[kernel.kallsyms]] (1,737,656 samples, 0.01%) - +tlb_finish_mmu (14,678,108 samples, 0.11%) + -[[kernel.kallsyms]] (1,737,164 samples, 0.01%) - +void std::__atomic_wait_address<std::__barrier_phase_t, std::__tree_barrier<NopStruct>::wait (1,739,970 samples, 0.01%) + -void fill_mt<unsigned long> (4,401,975,708 samples, 34.42%) -void fill_mt<unsigned long> +operator new (25,295,540 samples, 0.19%) + -dl_open_worker (1,262,425 samples, 0.01%) - +__x64_sys_mprotect (25,295,540 samples, 0.19%) + -[[kernel.kallsyms]] (10,426,718 samples, 0.08%) - +do_vmi_align_munmap (17,268,198 samples, 0.13%) + -std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (28,378,674 samples, 0.22%) - +__folio_alloc (1,475,070,703 samples, 10.94%) +__folio_alloc -[[kernel.kallsyms]] (10,426,718 samples, 0.08%) - +__vm_munmap (5,177,974 samples, 0.04%) + -[[kernel.kallsyms]] (1,737,280 samples, 0.01%) - +__vdso_clock_gettime (1,597,729,157 samples, 11.85%) +__vdso_clock_gett.. -[anon] (154,796,010 samples, 1.21%) - +handle_mm_fault (1,818,426,587 samples, 13.49%) +handle_mm_fault -__GI_mprotect (1,106,913 samples, 0.01%) - +start_thread (7,574,753,945 samples, 56.20%) +start_thread -void std::__atomic_impl::notify_all<std::__barrier_phase_t> (1,737,164 samples, 0.01%) - +openat (2,824,555 samples, 0.02%) + -main (4,776,770,397 samples, 37.35%) -main +__GI_mprotect (25,295,540 samples, 0.19%) + -[[kernel.kallsyms]] (1,737,280 samples, 0.01%) - +__GI_mprotect (6,744,844 samples, 0.05%) + -_dl_check_all_versions (1,132,673 samples, 0.01%) - +scan_a (5,412,771,977 samples, 40.16%) +scan_a -[[kernel.kallsyms]] (1,810,367,544 samples, 14.16%) -[[kernel.kallsyms]] +auto dml::detail::ml::make_mem_move_task<std::allocator<unsigned char> > (25,295,540 samples, 0.19%) + -__GI_exit (1,662,976 samples, 0.01%) - +tick_sched_handle (1,704,757 samples, 0.01%) + -[[kernel.kallsyms]] (1,737,280 samples, 0.01%) - +change_protection (6,744,844 samples, 0.05%) + -void std::__atomic_notify_address<std::__barrier_phase_t> (1,737,164 samples, 0.01%) - +wqs_init (4,519,291 samples, 0.03%) + -[[kernel.kallsyms]] (18,068,150 samples, 0.14%) - +path_openat (2,824,555 samples, 0.02%) + -__sysfs_device_parse (2,238,026 samples, 0.02%) - +do_syscall_64 (17,268,198 samples, 0.13%) + -[[kernel.kallsyms]] (1,562,656 samples, 0.01%) - +all (13,478,052,094 samples, 100%) + -[[kernel.kallsyms]] (5,959,122 samples, 0.05%) - +tick_sched_timer (20,760,710 samples, 0.15%) + -[[kernel.kallsyms]] (13,817,985 samples, 0.11%) - +__vm_munmap (17,268,198 samples, 0.13%) + -[[kernel.kallsyms]] (1,737,164 samples, 0.01%) - +update_process_times (1,704,757 samples, 0.01%) + -[[kernel.kallsyms]] (25,788,811 samples, 0.20%) - +tick_sched_handle (2,568,736 samples, 0.02%) + -_int_malloc (28,378,674 samples, 0.22%) - +void fill_mt<unsigned long> (4,915,410,540 samples, 36.47%) +void fill_mt<unsigned long> -std::chrono::_V2::steady_clock::now (97,534,503 samples, 0.76%) - +tick_sched_timer (2,568,736 samples, 0.02%) + -std::__tree_barrier<NopStruct>::wait (2,605,852 samples, 0.02%) - +do_vmi_munmap (17,268,198 samples, 0.13%) + -_dl_check_map_versions (1,132,673 samples, 0.01%) - +release_pages (13,814,219 samples, 0.10%) + -[[kernel.kallsyms]] (1,662,976 samples, 0.01%) - +numa_node_size64 (9,332,869 samples, 0.07%) + -std::__detail::__waiter_pool_base::_M_notify (1,737,164 samples, 0.01%) - +_int_malloc (25,295,540 samples, 0.19%) + -dml::core::dispatcher::hw_dispatcher::~hw_dispatcher (1,662,976 samples, 0.01%) - +main (5,288,746,947 samples, 39.24%) +main -[[kernel.kallsyms]] (6,243,993 samples, 0.05%) - +std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (608,239,162 samples, 4.51%) +std::.. -[[kernel.kallsyms]] (1,735,727 samples, 0.01%) - +do_madvise (1,739,797 samples, 0.01%) + -[[kernel.kallsyms]] (11,267,710 samples, 0.09%) - +exc_page_fault (6,888,173 samples, 0.05%) + -__libc_openat64 (1,143,574 samples, 0.01%) - +do_mprotect_pkey (6,744,844 samples, 0.05%) + -dsacache::Cache::SubmitTask (45,447,457 samples, 0.36%) - +dsacache::Cache::Clear (5,177,974 samples, 0.04%) + -[[kernel.kallsyms]] (1,143,574 samples, 0.01%) - +clear_huge_page (1,737,509 samples, 0.01%) + -[[kernel.kallsyms]] (1,737,164 samples, 0.01%) - +dml::core::dispatcher::hw_device::initialize_new_device (4,958,905 samples, 0.04%) + -__libc_openat64 (6,243,993 samples, 0.05%) - +_raw_spin_lock (6,071,274 samples, 0.05%) + -__GI_madvise (1,737,656 samples, 0.01%) - +__cond_resched (2,603,227 samples, 0.02%) + -dsacache::Cache::ExecuteCopy (40,290,223 samples, 0.32%) - +pte_alloc_one (4,339,374 samples, 0.03%) + -version_check_doit (1,132,673 samples, 0.01%) - +sysvec_apic_timer_interrupt (4,298,295 samples, 0.03%) + -unsigned long std::uniform_int_distribution<unsigned long>::operator (306,053,484 samples, 2.39%) -u.. +qi_submit_sync (12,655,234 samples, 0.09%) + -dml::core::dispatcher::hw_device::initialize_new_device (10,649,124 samples, 0.08%) - +__hrtimer_run_queues (20,760,710 samples, 0.15%) + -[[kernel.kallsyms]] (1,262,425 samples, 0.01%) - +std::__detail::_Hashtable_alloc<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::_M_deallocate_node (5,177,974 samples, 0.04%) + -[[kernel.kallsyms]] (2,589,863 samples, 0.02%) - +asm_sysvec_apic_timer_interrupt (20,760,710 samples, 0.15%) + -__GI__dl_catch_exception (1,262,425 samples, 0.01%) - +entry_SYSCALL_64_after_hwframe (25,295,540 samples, 0.19%) + -[[kernel.kallsyms]] (6,243,993 samples, 0.05%) - +__x64_sys_madvise (1,739,797 samples, 0.01%) + -[[kernel.kallsyms]] (1,143,574 samples, 0.01%) - +__GI_munmap (5,177,974 samples, 0.04%) + -_mm512_stream_load_si512 (279,005,046 samples, 2.18%) -_.. +do_anonymous_page (1,737,026 samples, 0.01%) + -[[kernel.kallsyms]] (1,737,656 samples, 0.01%) - +mprotect_fixup (6,744,844 samples, 0.05%) + -dsacache::CacheData::WaitOnCompletion (11,295,967 samples, 0.09%) - +get_page_from_freelist (1,735,652 samples, 0.01%) + -[[kernel.kallsyms]] (6,243,993 samples, 0.05%) - +__hrtimer_run_queues (2,568,736 samples, 0.02%) + -[[kernel.kallsyms]] (2,585,739 samples, 0.02%) - +tlb_batch_pages_flush (13,814,219 samples, 0.10%) + -[[kernel.kallsyms]] (1,737,280 samples, 0.01%) - +std::unordered_map<unsigned char*, dsacache::CacheData, std::hash<unsigned char*>, std::equal_to<unsigned char*>, std::allocator<std::pair<unsigned char* const, dsacache::CacheData> > >::clear (5,177,974 samples, 0.04%) + -_dlerror_run (1,262,425 samples, 0.01%) - +entry_SYSCALL_64_after_hwframe (3,468,037 samples, 0.03%) + -__GI_munmap (1,662,976 samples, 0.01%) - +dml::core::dispatcher::hw_dispatcher::initialize_hw (4,958,906 samples, 0.04%) + -[[kernel.kallsyms]] (1,724,282 samples, 0.01%) - +aggr_j (2,119,810,645 samples, 15.73%) +aggr_j -[[vdso]] (3,393,286,810 samples, 26.53%) -[[vdso]] +clear_page_erms (1,214,507,617 samples, 9.01%) +clear_page_erms -[[kernel.kallsyms]] (18,068,150 samples, 0.14%) - +device_parse (4,519,291 samples, 0.03%) + -__pthread_create_2_1 (2,587,208 samples, 0.02%) - +task_tick_fair (1,734,074 samples, 0.01%) + -[[kernel.kallsyms]] (1,562,656 samples, 0.01%) - +accfg_wq_get_first (4,519,291 samples, 0.03%) + -std::__detail::__thread_yield (1,737,314 samples, 0.01%) - +dsacache::Cache::Access (1,737,345 samples, 0.01%) + -_dl_receive_error (1,132,673 samples, 0.01%) - +do_syscall_64 (25,295,540 samples, 0.19%) + -[[kernel.kallsyms]] (1,725,071 samples, 0.01%) - +Aggregation<unsigned long, Sum, (2,116,333,329 samples, 15.70%) +Aggregation<unsigned lon.. -_dl_protect_relro (1,106,913 samples, 0.01%) - +__libc_start_call_main (5,290,426,542 samples, 39.25%) +__libc_start_call_main -[[kernel.kallsyms]] (1,737,280 samples, 0.01%) - +qi_flush_piotlb (2,570,042 samples, 0.02%) + -unsigned long std::uniform_int_distribution<unsigned long>::operator (319,807,366 samples, 2.50%) -un.. +vma_alloc_folio (1,475,070,703 samples, 10.94%) +vma_alloc_folio -[[kernel.kallsyms]] (18,810,377 samples, 0.15%) - +futex_wait (1,739,970 samples, 0.01%) + -__GI_munmap (18,068,150 samples, 0.14%) - +void std::destroy_at<std::pair<unsigned char* const, dsacache::CacheData> > (5,177,974 samples, 0.04%) + -[[kernel.kallsyms]] (1,094,452 samples, 0.01%) - +[libstdc++.so.6.0.32] (7,573,014,148 samples, 56.19%) +[libstdc++.so.6.0.32] -[[kernel.kallsyms]] (1,143,574 samples, 0.01%) - +free_unref_page_prepare (12,087,054 samples, 0.09%) + -dml::core::dispatcher::hw_queue::initialize_new_queue (1,751,881 samples, 0.01%) - +__mem_cgroup_charge (1,735,148 samples, 0.01%) + -[[kernel.kallsyms]] (1,737,164 samples, 0.01%) - +void std::allocator_traits<std::allocator<std::__detail::_Hash_node<std::pair<unsigned char* const, dsacache::CacheData>, false> > >::destroy<std::pair<unsigned char* const, dsacache::CacheData> > (5,177,974 samples, 0.04%) + -__mmap64 (1,262,425 samples, 0.01%) - +scheduler_tick (2,568,736 samples, 0.02%) + -accfg_get_param_long (8,019,769 samples, 0.06%) - +qi_submit_sync (2,570,042 samples, 0.02%) + -grow_heap (2,585,739 samples, 0.02%) - +clock_gettime@plt (32,929,155 samples, 0.24%) + -[[kernel.kallsyms]] (2,587,208 samples, 0.02%) - +do_syscall_64 (1,739,797 samples, 0.01%) + -[[kernel.kallsyms]] (3,452,194 samples, 0.03%) - +__alloc_pages (1,475,070,703 samples, 10.94%) +__alloc_pages -__run_exit_handlers (1,662,976 samples, 0.01%) - +sum_check (350,026,063 samples, 2.60%) +su.. -__sysfs_device_parse (8,897,243 samples, 0.07%) - +kernel_clone (3,468,037 samples, 0.03%) + -__GI__IO_file_doallocate (2,585,739 samples, 0.02%) - +_IO_new_file_underflow (8,469,848 samples, 0.06%) + -[[kernel.kallsyms]] (3,452,194 samples, 0.03%) - +qi_submit_sync (10,915,702 samples, 0.08%) + -[libstdc++.so.6.0.32] (3,648,891,641 samples, 28.53%) -[libstdc++.so.6.0.32] +entry_SYSCALL_64_after_hwframe (6,744,844 samples, 0.05%) + -[[kernel.kallsyms]] (2,587,208 samples, 0.02%) - +std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::_M_gen_rand (433,839,037 samples, 3.22%) +std.. -grow_heap (25,788,811 samples, 0.20%) - +intel_invalidate_range (23,570,936 samples, 0.17%) + -bool std::__detail::__waiter_base<std::__detail::__waiter_pool>::_M_do_spin<std::__tree_barrier<NopStruct>::wait (1,737,314 samples, 0.01%) - +hrtimer_interrupt (6,036,217 samples, 0.04%) + -std::chrono::_V2::steady_clock::now (48,250,590 samples, 0.38%) - +__x64_sys_mprotect (6,744,844 samples, 0.05%) + -[[kernel.kallsyms]] (25,788,811 samples, 0.20%) - +__sysfs_device_parse (4,519,291 samples, 0.03%) + -[[kernel.kallsyms]] (1,735,727 samples, 0.01%) - +scheduler_tick (20,760,710 samples, 0.15%) + -[[kernel.kallsyms]] (2,585,739 samples, 0.02%) - +qi_flush_dev_iotlb_pasid (12,655,234 samples, 0.09%) + -[[kernel.kallsyms]] (1,833,688,222 samples, 14.34%) -[[kernel.kallsyms]] +__alloc_pages (4,339,374 samples, 0.03%) + -[[kernel.kallsyms]] (1,737,656 samples, 0.01%) - +do_mprotect_pkey (25,295,540 samples, 0.19%) + -[[kernel.kallsyms]] (5,210,636 samples, 0.04%) - +dml::detail::ml::impl::hardware::submit (5,803,116 samples, 0.04%) + -openat (6,243,993 samples, 0.05%) - +__GI___clock_gettime (1,628,054,632 samples, 12.08%) +__GI___clock_gettime -std::__tree_barrier<NopStruct>::arrive (2,605,490 samples, 0.02%) - +hrtimer_interrupt (2,568,736 samples, 0.02%) + -[[kernel.kallsyms]] (2,496,020 samples, 0.02%) - +qi_flush_piotlb (1,679,595 samples, 0.01%) + -unsigned int std::uniform_int_distribution<unsigned long>::_S_nd<unsigned long, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>, unsigned int> (248,928,848 samples, 1.95%) -u.. +perf_adjust_freq_unthr_context (1,704,757 samples, 0.01%) + -std::__new_allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> >::allocate (28,378,674 samples, 0.22%) - +std::barrier<NopStruct>::wait (1,739,970 samples, 0.01%) + -void std::__atomic_wait_address<std::__barrier_phase_t, std::__tree_barrier<NopStruct>::wait (2,605,852 samples, 0.02%) - +clear_page_erms (307,946,959 samples, 2.28%) +c.. -[[kernel.kallsyms]] (3,360,039 samples, 0.03%) - +__rmqueue_pcplist (1,735,652 samples, 0.01%) + -[[kernel.kallsyms]] (3,452,194 samples, 0.03%) - +__mmu_notifier_invalidate_range_end (1,679,595 samples, 0.01%) + -[[kernel.kallsyms]] (6,243,993 samples, 0.05%) - +inherit_event.isra.0 (2,599,737 samples, 0.02%) + -[[kernel.kallsyms]] (12,950,787 samples, 0.10%) - - - -[[kernel.kallsyms]] (2,589,863 samples, 0.02%) - - - -[[kernel.kallsyms]] (2,496,020 samples, 0.02%) - - - -accfg_wq_get_state (1,751,881 samples, 0.01%) - - - -[[kernel.kallsyms]] (2,587,208 samples, 0.02%) - - - -[[kernel.kallsyms]] (3,452,194 samples, 0.03%) - - - -[[kernel.kallsyms]] (3,452,194 samples, 0.03%) - - - -std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>::operator (1,000,135,128 samples, 7.82%) -std::mersen.. - - -[[kernel.kallsyms]] (5,210,636 samples, 0.04%) - - - -[[kernel.kallsyms]] (1,662,976 samples, 0.01%) - - - -[[kernel.kallsyms]] (3,452,194 samples, 0.03%) - - - -[[kernel.kallsyms]] (18,068,150 samples, 0.14%) - - - -allocate_stack (2,587,208 samples, 0.02%) - - - -dsacache::Cache::Access (45,447,457 samples, 0.36%) - - - -_dl_start_final (2,239,586 samples, 0.02%) - - - -numa_node_size64 (5,157,234 samples, 0.04%) - - - -[[kernel.kallsyms]] (2,587,208 samples, 0.02%) - - - -[[kernel.kallsyms]] (6,243,993 samples, 0.05%) - - - -bool std::__detail::__atomic_spin<std::__tree_barrier<NopStruct>::wait (1,737,314 samples, 0.01%) - - - -[[kernel.kallsyms]] (8,686,189 samples, 0.07%) - - - -__mmap64 (1,262,425 samples, 0.01%) - - - -[[kernel.kallsyms]] (10,426,718 samples, 0.08%) - - - -[[kernel.kallsyms]] (1,562,656 samples, 0.01%) - - - -std::allocator_traits<std::allocator<dml::detail::ml::utils::structure_from<dml::detail::descriptor, dml::detail::completion_record> > >::allocate (28,378,674 samples, 0.22%) - - - -add_group (3,101,168 samples, 0.02%) - - - -[[kernel.kallsyms]] (1,737,164 samples, 0.01%) - - - -__GI__dl_catch_exception (1,262,425 samples, 0.01%) - - - -__GI___clock_gettime (70,219,218 samples, 0.55%) - - - -std::barrier<NopStruct>::wait (3,474,594 samples, 0.03%) - - - -[[kernel.kallsyms]] (1,737,656 samples, 0.01%) - - - -[[kernel.kallsyms]] (1,735,727 samples, 0.01%) - - - -strncmp (1,132,673 samples, 0.01%) - - - -[[kernel.kallsyms]] (1,106,913 samples, 0.01%) - - - -[[kernel.kallsyms]] (1,735,727 samples, 0.01%) - - - -add_device (2,238,026 samples, 0.02%) - - - -[[kernel.kallsyms]] (1,735,727 samples, 0.01%) - - - -all (12,788,431,143 samples, 100%) - - - -[[kernel.kallsyms]] (2,587,208 samples, 0.02%) - - - -_start (4,781,706,256 samples, 37.39%) -_start - - -_mm512_mask_add_epi64 (18,513,417 samples, 0.14%) - - - -_mm512_stream_load_si512 (201,075,858 samples, 1.57%) - - - -_dl_sysdep_start (2,239,586 samples, 0.02%) - - - -std::thread& std::vector<std::thread, std::allocator<std::thread> >::emplace_back<void (2,587,208 samples, 0.02%) - - - -[[kernel.kallsyms]] (3,452,194 samples, 0.03%) - - - -wqs_init (8,897,243 samples, 0.07%) - - - -aggr_j (313,529,084 samples, 2.45%) -ag.. - - -[[kernel.kallsyms]] (2,585,739 samples, 0.02%) - - - -[[kernel.kallsyms]] (1,833,688,222 samples, 14.34%) -[[kernel.kallsyms]] - - -_dl_start (2,239,586 samples, 0.02%) - - - -std::__detail::__waiter_pool::_M_do_wait (1,737,280 samples, 0.01%) - - - -std::common_type<std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >::type std::chrono::operator-<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > (3,498,307 samples, 0.03%) - - - -_dl_relocate_object (1,106,913 samples, 0.01%) - - - -[[kernel.kallsyms]] (1,735,727 samples, 0.01%) - - - -device_parse (2,238,026 samples, 0.02%) - - - -[[kernel.kallsyms]] (1,737,164 samples, 0.01%) - - - -[[kernel.kallsyms]] (1,262,425 samples, 0.01%) - - - -accfg_get_param_str (2,040,247 samples, 0.02%) - - - -[[kernel.kallsyms]] (1,510,538,762 samples, 11.81%) -[[kernel.kallsyms]] - - -bool std::__detail::__waiter_base<std::__detail::__waiter_pool>::_S_do_spin<std::__tree_barrier<NopStruct>::wait (1,737,314 samples, 0.01%) - - - -[[kernel.kallsyms]] (1,737,280 samples, 0.01%) - - - -__GI___libc_read (1,775,776 samples, 0.01%) - - - -[[kernel.kallsyms]] (2,589,863 samples, 0.02%) - - - -_dl_open (1,262,425 samples, 0.01%) - - - -__GI__IO_doallocbuf (2,585,739 samples, 0.02%) - - - -[[kernel.kallsyms]] (5,959,122 samples, 0.05%) - - - -start_thread (3,650,629,297 samples, 28.55%) -start_thread - - -[[kernel.kallsyms]] (3,452,194 samples, 0.03%) - - - -auto dml::detail::ml::make_mem_move_task<std::allocator<unsigned char> > (28,378,674 samples, 0.22%) - - - -[[kernel.kallsyms]] (1,262,425 samples, 0.01%) - - - -[[kernel.kallsyms]] (2,589,863 samples, 0.02%) - - - -void std::allocator_traits<std::allocator<std::thread> >::construct<std::thread, void (2,587,208 samples, 0.02%) - - - -__GI___close (1,094,452 samples, 0.01%) - - - -void std::__detail::__platform_wait<int> (1,735,727 samples, 0.01%) - - - -void std::vector<std::thread, std::allocator<std::thread> >::_M_realloc_insert<void (2,587,208 samples, 0.02%) - - - -[[kernel.kallsyms]] (2,585,739 samples, 0.02%) - - - -accfg_wq_get_first (8,897,243 samples, 0.07%) - - - -dml::detail::ml::buffer<std::allocator<unsigned char>, dml::detail::descriptor, dml::detail::completion_record>::buffer (28,378,674 samples, 0.22%) - - - -[[kernel.kallsyms]] (10,426,718 samples, 0.08%) - - - -__GI___libc_malloc (2,585,739 samples, 0.02%) - - - -_dl_map_object (1,262,425 samples, 0.01%) - - - -[[kernel.kallsyms]] (3,621,354 samples, 0.03%) - - - -void std::__atomic_wait_address<std::__barrier_phase_t, std::__tree_barrier<NopStruct>::wait (1,737,314 samples, 0.01%) - - - -[[kernel.kallsyms]] (1,737,656 samples, 0.01%) - - - -Aggregation<unsigned long, Sum, (296,406,305 samples, 2.32%) -A.. - - -std::__tree_barrier<NopStruct>::wait (3,474,594 samples, 0.03%) - - - -dsa_initialize_accelerator_driver (1,262,425 samples, 0.01%) - - - -[[kernel.kallsyms]] (24,927,309 samples, 0.19%) - - - -[[kernel.kallsyms]] (1,737,280 samples, 0.01%) - - - -decltype (2,587,208 samples, 0.02%) - - - -[[kernel.kallsyms]] (1,562,656 samples, 0.01%) - - - -[[kernel.kallsyms]] (2,585,739 samples, 0.02%) - - - -___dlopen (1,262,425 samples, 0.01%) - - - -[[kernel.kallsyms]] (2,589,863 samples, 0.02%) - +do_syscall_64 (1,679,595 samples, 0.01%) + diff --git a/qdp_project/record-perf.sh b/qdp_project/record-perf.sh index f5be3a1..8f89a15 100644 --- a/qdp_project/record-perf.sh +++ b/qdp_project/record-perf.sh @@ -1,3 +1,3 @@ echo 0 > /proc/sys/kernel/kptr_restrict perf record -e cycles -g --call-graph dwarf -- cmake-build-reldeb/QDPBench -perf script | /home/cfuerst/FlameGraph/stackcollapse-perf.pl | /home/cfuerst/FlameGraph/flamegraph.pl > results/perf.pl +perf script | /home/cfuerst/FlameGraph/stackcollapse-perf.pl | /home/cfuerst/FlameGraph/flamegraph.pl > results/perf.svg