Code for "Energy Considerations of Large Language Model Inference and Efficiency Optimizations" (ACL 2025).
# Install PyTorch and CodeCarbon
mamba install pytorch torchvision torchaudio pytorch-cuda=12.4 cudnn -c pytorch -c nvidia
mamba install -c codecarbon codecarbon
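# Optional sanity check: confirm PyTorch sees the GPU and CodeCarbon imports cleanly
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
python -c "import codecarbon; print(codecarbon.__version__)"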
# Install flash-attn
mamba install cuda cuda-toolkit=12.4 -c nvidia
mamba install cuda-cudart cuda-version=12 -c nvidia
MAX_JOBS=4 pip install flash-attn --no-build-isolation
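# Optional: verify that flash-attn imports against the installed CUDA/PyTorch stack
python -c "import flash_attn; print(flash_attn.__version__)"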
# Dependencies
mamba install numpy=1.26 sympy=1.13.1 tokenizers=0.20.4 tiktoken=0.7 transformers=4.46.3
mamba install openmpi timm diffusers mistral-common
mamba install gcc ninja aiohttp ipython librosa pysoundfile
# Install TensorRT for ONNX Runtime
python3 -m pip install tensorrt-cu12 tensorrt-lean-cu12 tensorrt-dispatch-cu12 tensorrt
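# Optional: verify that the TensorRT Python bindings load
python3 -c "import tensorrt; print(tensorrt.__version__)"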
pip install bitsandbytes
# Download the TensorRT tarball; untar it and place it in your project directory
wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz
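# One possible extraction sequence (assumes the archive unpacks to TensorRT-10.7.0.23; adjust paths to your layout)
tar -xzf TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz
export LD_LIBRARY_PATH=$PWD/TensorRT-10.7.0.23/lib:$LD_LIBRARY_PATH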
pip install vllm==0.6.6
# For DeepSpeed Profiler
pip install deepspeed==0.16.2
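# Optional: ds_report prints a summary of the DeepSpeed install and op compatibility
ds_report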
# orjson (can likely also be installed via mamba, but this has not been verified)
pip install orjson
# Install optimum-benchmark locally from source at commit 6e6b103
# After cloning, initialize its submodules and install in editable mode with the required extras
cd optimum-benchmark
git submodule init
git submodule update
pip install -e ".[onnxruntime-gpu,vllm]"
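# If the editable install succeeded, the optimum-benchmark CLI entry point should be on PATH
# (entry point name assumed from the upstream repo; not verified at this exact commit)
optimum-benchmark --help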
# Run an experiment
bash run/run_{TASK_NAME}.sh
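# For example, with a hypothetical task name (the actual script names live under run/):
bash run/run_summarization.sh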