Using the command below and running generate several times, xft becomes blocked (hangs) in oneCCL:
OMP_NUM_THREADS=48 LD_PRELOAD=libiomp5.so mpirun -n 1 numactl --physcpubind 16-63 --localalloc python demo.py -t /mnt/data/LLM_Models/Qwen-14B-Chat/ -m /mnt/data/LLM_Models/Qwen-14B-Chat/cpu/ --output_len 512 --dtype bf16_fp16 --do_sample true : -n 1 numactl --physcpubind 80-127 --localalloc python demo.py -t /mnt/data/LLM_Models/Qwen-14B-Chat/ -m /mnt/data/LLM_Models/Qwen-14B-Chat/cpu/ --output_len 512 --dtype bf16_fp16 --do_sample true