We saw a performance regression by a factor of 2-3 when switching from {fmt} version 8.1.0 to 10.0.0 (it also happens with 10.1.1).

I extracted an MWE from our code.

`CMakeLists.txt`:
```cmake
cmake_minimum_required(VERSION 3.20)
project(Test)

set(CMAKE_CXX_STANDARD 17)

add_executable(prog main.cpp)

include(FetchContent)
set(FMT_TEST OFF CACHE BOOL "" FORCE)
set(FMT_DOC OFF CACHE BOOL "" FORCE)
set(FMT_INSTALL OFF CACHE BOOL "" FORCE)
set(FMT_SYSTEM_HEADERS ON CACHE BOOL "" FORCE)
FetchContent_Declare(fmt
    GIT_REPOSITORY https://github.com/fmtlib/fmt.git
    GIT_TAG 8.1.0
    GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(fmt)
set_property(TARGET fmt PROPERTY POSITION_INDEPENDENT_CODE ON)

add_dependencies(prog fmt)
target_include_directories(prog PUBLIC ${fmt_SOURCE_DIR}/include)
target_link_libraries(prog PUBLIC fmt::fmt)

find_package(OpenMP REQUIRED)
target_compile_options(prog PUBLIC -fopenmp)
target_link_libraries(prog PUBLIC -fopenmp)
```
#include "fmt/compile.h"
#include "fmt/format.h"
#include "fmt/chrono.h"
#include "fmt/os.h"
#include "omp.h"
#include <vector>
#include <random>
#include <string>
using real_type = double;

template <typename T>
using matrix = std::vector<std::vector<T>>;

template <typename label_type>
inline void write(const std::string &filename, const std::vector<label_type> &labels, const matrix<real_type> &support_vectors) {
    const std::size_t num_features = support_vectors.front().size();
    const std::size_t num_classes = 2;
    const std::size_t num_alpha_per_point = num_classes;
    const std::vector<label_type> label_order = { 0, 1 };
    // create the file
    fmt::ostream out = fmt::output_file(filename);

    // write a timestamp for the current date and time
    // note: commented out since the resulting model file cannot be read by LIBSVM
    // out.print("# This model file has been created at {}\n", detail::current_date_time());

    // the maximum size of one formatted LIBSVM entry, e.g., 1234:1.365363e+10:
    // - biggest number representable as std::size_t: 18446744073709551615 -> 20 chars
    // - scientific notation: 3 chars (digit in front of the decimal separator including a sign + the decimal separator)
    //   + 10 chars (part after the decimal separator, specified during formatting)
    //   + 5 chars for the exponent (e + sign + maximum potential exponent (308 -> 3 digits))
    // - separators: 2 chars (':' between index and feature value + whitespace after the feature value)
    // -> 40 chars in total, increased to 48 chars to be on the safe side
    static constexpr std::size_t CHARS_PER_BLOCK = 48;
    // results in a 48 B * 128 = 6 KiB stack buffer per thread
    static constexpr std::size_t BLOCK_SIZE = 128;
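    // compile-time sanity checks for the size arithmetic above:
    // 20 + 3 + 10 + 5 + 2 = 40 <= 48, and 48 B * 128 = 6144 B = 6 KiB
    static_assert(CHARS_PER_BLOCK >= 20 + 3 + 10 + 5 + 2, "one formatted LIBSVM entry must fit into CHARS_PER_BLOCK");
    static_assert(CHARS_PER_BLOCK * BLOCK_SIZE == 6 * 1024, "the per-thread stack buffer is expected to be 6 KiB");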
    // use 1 MiB as string buffer per thread
    constexpr std::size_t STRING_BUFFER_SIZE = 1024 * 1024;
    // format one output line
    auto format_libsvm_line = [](std::string &output, const std::vector<real_type> &a, const matrix<real_type> &d, const std::size_t point) {
        static constexpr std::size_t STACK_BUFFER_SIZE = BLOCK_SIZE * CHARS_PER_BLOCK;
        static char buffer[STACK_BUFFER_SIZE];
#pragma omp threadprivate(buffer)

        output.append(fmt::format("{:.10e} ", fmt::join(a, " ")));
        for (typename std::vector<real_type>::size_type j = 0; j < d.front().size(); j += BLOCK_SIZE) {
            char *ptr = buffer;
            for (std::size_t i = 0; i < std::min<std::size_t>(BLOCK_SIZE, d.front().size() - j); ++i) {
                if (d[point][j + i] != real_type{ 0.0 }) {
                    // add 1 to the index since LIBSVM assumes 1-based feature indexing
                    ptr = fmt::format_to(ptr, FMT_COMPILE("{}:{:.10e} "), j + i + 1, d[point][j + i]);
                }
            }
            output.append(buffer, ptr - buffer);
        }
        output.push_back('\n');
    };
    // initialize the volatile counters used to order the classes in the output file
    auto counts = std::make_unique<volatile int[]>(label_order.size() + 1);
    counts[0] = std::numeric_limits<int>::max();
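    // synchronization scheme: after flushing its remaining lines for class l, each
    // thread increments counts[l + 1]; before writing any lines of class l, a thread
    // spins until counts[l] has reached the number of threads, i.e., until every
    // thread has finished the previous class; counts[0] = INT_MAX so that the first
    // class is never blocked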
#pragma omp parallel default(none) shared(counts, format_libsvm_line, label_order, labels, support_vectors, out) firstprivate(BLOCK_SIZE, CHARS_PER_BLOCK, num_features, num_classes, num_alpha_per_point)
    {
        // preallocate the string buffer, only ONE allocation
        std::string out_string;
        // oversubscribe the buffer so that at least one additional line fits into it
        out_string.reserve(STRING_BUFFER_SIZE + (num_features + num_alpha_per_point) * CHARS_PER_BLOCK);
        std::vector<real_type> alpha_per_point(num_alpha_per_point);

        // loop over all classes, since they must be sorted in the output
        for (typename std::vector<label_type>::size_type l = 0; l < label_order.size(); ++l) {
            // format the support vectors belonging to the l-th class
#pragma omp for nowait
            for (typename std::vector<real_type>::size_type i = 0; i < support_vectors.size(); ++i) {
                if (labels[i] == label_order[l]) {
                    format_libsvm_line(out_string, alpha_per_point, support_vectors, i);

                    // if the buffer is full, write it to the file
                    if (out_string.size() > STRING_BUFFER_SIZE) {
                        // wait until all threads have written the support vectors of the previous class
#ifdef _OPENMP
                        while (counts[l] < omp_get_num_threads()) {
                        }
#endif
#pragma omp critical
                        {
                            out.print("{}", out_string);
                        }
                        // clear the buffer
                        out_string.clear();
                    }
                }
            }
            // wait until all threads have written the support vectors of the previous class
#ifdef _OPENMP
            while (counts[l] < omp_get_num_threads()) {
            }
#endif
#pragma omp critical
            {
                if (!out_string.empty()) {
                    out.print("{}", out_string);
                    out_string.clear();
                }
                counts[l + 1] = counts[l + 1] + 1;
            }
        }
    }
}
int main() {
    std::random_device rd;
    std::mt19937 mt(rd());
    std::uniform_real_distribution<double> dist(-1.0, 1.0);

    std::vector<int> labels(73257);
    matrix<real_type> support_vectors(73257, std::vector<real_type>(3072));
    for (std::size_t i = 0; i < labels.size(); ++i) {
        labels[i] = i % 2;
        for (std::size_t j = 0; j < support_vectors.front().size(); ++j) {
            support_vectors[i][j] = dist(mt);
        }
    }

    auto start = std::chrono::steady_clock::now();
    write("test.libsvm", labels, support_vectors);
    auto end = std::chrono::steady_clock::now();
    fmt::print("{}\n", std::chrono::duration_cast<std::chrono::milliseconds>(end - start));
    return 0;
}
```
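To help narrow down whether the time is spent in the formatting itself rather than in the file I/O, here is a minimal single-threaded sketch that only exercises the hot `fmt::format_to` + `FMT_COMPILE` call from `format_libsvm_line`. The iteration counts mirror the MWE's data dimensions; I have not measured this variant separately, so treat it as a starting point rather than a result:

```cpp
#include "fmt/compile.h"
#include "fmt/format.h"
#include "fmt/chrono.h"

#include <chrono>   // std::chrono::steady_clock
#include <cstddef>  // std::size_t
#include <random>
#include <vector>

int main() {
    std::mt19937 mt(42);  // fixed seed so that runs are comparable across {fmt} versions
    std::uniform_real_distribution<double> dist(-1.0, 1.0);
    std::vector<double> values(3072);
    for (double &value : values) {
        value = dist(mt);
    }

    char buffer[64];  // large enough for one formatted LIBSVM entry (cf. CHARS_PER_BLOCK above)
    std::size_t total_chars = 0;

    const auto start = std::chrono::steady_clock::now();
    for (std::size_t rep = 0; rep < 73257; ++rep) {
        for (std::size_t i = 0; i < values.size(); ++i) {
            // the same compiled format string as in the MWE's hot loop
            char *ptr = fmt::format_to(buffer, FMT_COMPILE("{}:{:.10e} "), i + 1, values[i]);
            total_chars += static_cast<std::size_t>(ptr - buffer);  // keep the result alive
        }
    }
    const auto end = std::chrono::steady_clock::now();

    fmt::print("{} chars formatted in {}\n", total_chars, std::chrono::duration_cast<std::chrono::milliseconds>(end - start));
    return 0;
}
```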
I compiled the above code with g++ 11.4.0 via `cmake -DCMAKE_BUILD_TYPE=Release .. && make`.
On our machine (AMD EPYC 7763 64-core processor), I get the following runtimes for the MWE writing to an SSD:

- 8.1.0: 4067 ms
- 9.1.0: 4093 ms
- 10.0.0: 12954 ms
- 10.1.1: 13009 ms
Am I missing something, or shouldn't there be such a loss in performance simply from using a newer {fmt} version?