Performance regression since 10.0.0 #3674

Closed
@breyerml

Description

We saw a performance regression of a factor of 2-3 when switching from fmt version 8.1.0 to 10.0.0 (also happens with 10.1.1).

I extracted an MWE from our code, consisting of a CMakeLists.txt and a main.cpp.

CMakeLists.txt:

cmake_minimum_required (VERSION 3.20)
project (Test)

set(CMAKE_CXX_STANDARD 17)

add_executable(prog main.cpp)

include(FetchContent)
FetchContent_Declare(fmt
        GIT_REPOSITORY https://github.com/fmtlib/fmt.git
        GIT_TAG 8.1.0
        GIT_SHALLOW TRUE
        QUIET)

set(FMT_TEST OFF CACHE BOOL "" FORCE)
set(FMT_DOC OFF CACHE BOOL "" FORCE)
set(FMT_INSTALL OFF CACHE BOOL "" FORCE)
set(FMT_SYSTEM_HEADERS ON CACHE BOOL "" FORCE)

FetchContent_MakeAvailable(fmt)
set_property(TARGET fmt PROPERTY POSITION_INDEPENDENT_CODE ON)
add_dependencies(prog fmt)
target_include_directories(prog PUBLIC ${fmt_SOURCE_DIR}/include)
target_link_libraries(prog PUBLIC fmt::fmt)

find_package(OpenMP REQUIRED)
target_link_libraries(prog PUBLIC OpenMP::OpenMP_CXX)
#include "fmt/compile.h"
#include "fmt/format.h"
#include "fmt/chrono.h"
#include "fmt/os.h"

#include "omp.h"

#include <vector>
#include <random>
#include <string>

using real_type = double;

template <typename T>
using matrix = std::vector<std::vector<T>>;

template <typename label_type>
inline void write(const std::string &filename, const std::vector<label_type> &labels, const matrix<real_type> &support_vectors) {
    
    const std::size_t num_features = support_vectors.front().size();
    const std::size_t num_classes = 2;
    const std::size_t num_alpha_per_point = num_classes;
    const std::vector<label_type> label_order = { 0, 1 };

    // create file
    fmt::ostream out = fmt::output_file(filename);
    // write timestamp for current date time
    // note: commented out since the resulting model file cannot be read by LIBSVM
    //    out.print("# This model file has been created at {}\n", detail::current_date_time());

    // the maximum size of one formatted LIBSVM entry, e.g., 1234:1.365363e+10
    // biggest number representable as std::size_t: 18446744073709551615 -> 20 chars
    // scientific notation: 3 chars (number in front of decimal separator including a sign + decimal separator) + 10 chars (part after the decimal separator, specified during formatting) +
    //                      5 chars for the exponent (e + sign + maximum potential exponent (308 -> 3 digits))
    // separators: 2 chars (: between index and feature + whitespace after feature value)
    // -> 40 chars in total
    // -> increased to 48 chars to be on the safe side
    static constexpr std::size_t CHARS_PER_BLOCK = 48;
    // results in a 48 B * 128 = 6 KiB stack buffer per thread
    static constexpr std::size_t BLOCK_SIZE = 128;
    // use 1 MiB as buffer per thread
    constexpr std::size_t STRING_BUFFER_SIZE = 1024 * 1024;

    // format one output-line
    auto format_libsvm_line = [](std::string &output, const std::vector<real_type> &a, const matrix<real_type> &d, const std::size_t point) {
        static constexpr std::size_t STACK_BUFFER_SIZE = BLOCK_SIZE * CHARS_PER_BLOCK;
        static char buffer[STACK_BUFFER_SIZE];
        #pragma omp threadprivate(buffer)

        output.append(fmt::format("{:.10e} ", fmt::join(a, " ")));
        for (typename std::vector<real_type>::size_type j = 0; j < d.front().size(); j += BLOCK_SIZE) {
            char *ptr = buffer;
            for (std::size_t i = 0; i < std::min<std::size_t>(BLOCK_SIZE, d.front().size() - j); ++i) {
                if (d[point][j + i] != real_type{ 0.0 }) {
                    // add 1 to the index since LIBSVM assumes 1-based feature indexing
                    ptr = fmt::format_to(ptr, FMT_COMPILE("{}:{:.10e} "), j + i + 1, d[point][j + i]);
                }
            }
            output.append(buffer, ptr - buffer);
        }
        output.push_back('\n');
    };

    // volatile counters used to enforce the class output order across threads;
    // counts[0] is set to INT_MAX so that no thread waits before the first class
    auto counts = std::make_unique<volatile int[]>(label_order.size() + 1);
    counts[0] = std::numeric_limits<int>::max();
    #pragma omp parallel default(none) shared(counts, format_libsvm_line, label_order, labels, support_vectors, out) firstprivate(BLOCK_SIZE, CHARS_PER_BLOCK, num_features, num_classes, num_alpha_per_point)
    {
        // preallocate string buffer, only ONE allocation
        std::string out_string;
        out_string.reserve(STRING_BUFFER_SIZE + (num_features + num_alpha_per_point) * CHARS_PER_BLOCK);  // oversize the buffer so that at least one additional line fits into it
        std::vector<real_type> alpha_per_point(num_alpha_per_point);

        // loop over all classes, since the support vectors must be written grouped by class
        for (typename std::vector<label_type>::size_type l = 0; l < label_order.size(); ++l) {
            // write the support vectors belonging to the l-th class
            #pragma omp for nowait
            for (typename std::vector<real_type>::size_type i = 0; i < support_vectors.size(); ++i) {
                if (labels[i] == label_order[l]) {
                    format_libsvm_line(out_string, alpha_per_point, support_vectors, i);

                    // if the buffer is full, write it to the file
                    if (out_string.size() > STRING_BUFFER_SIZE) {
                        // wait for all threads to write support vectors for previous class
#ifdef _OPENMP
                        while (counts[l] < omp_get_num_threads()) {
                        }
#endif
                        #pragma omp critical
                        {
                            out.print("{}", out_string);
                        }
                        // clear buffer
                        out_string.clear();
                    }
                }
            }
            // wait for all threads to write support vectors for previous class
#ifdef _OPENMP
            while (counts[l] < omp_get_num_threads()) {
            }
#endif

            #pragma omp critical
            {
                if (!out_string.empty()) {
                    out.print("{}", out_string);
                    out_string.clear();
                }
                counts[l + 1] = counts[l + 1] + 1;
            }
        }
    }
}



int main() {

  std::random_device rd;
  std::mt19937 mt(rd());
  std::uniform_real_distribution<double> dist(-1.0, 1.0);

  std::vector<int> labels(73257);
  matrix<real_type> support_vectors(73257, std::vector<real_type>(3072));

  for (std::size_t i = 0; i < labels.size(); ++i) {
    labels[i] = i % 2;
    for (std::size_t j = 0; j < support_vectors.front().size(); ++j) {
      support_vectors[i][j] = dist(mt);
    }
  }


  auto start = std::chrono::steady_clock::now();
  write("test.libsvm", labels, support_vectors);
  auto end = std::chrono::steady_clock::now();
  fmt::print("{}\n", std::chrono::duration_cast<std::chrono::milliseconds>(end - start));

  return 0;
}

I compiled the above code with g++ 11.4.0 in Release mode: cmake -DCMAKE_BUILD_TYPE=Release .. && make

On our machine (AMD EPYC 7763 64-Core Processor) I get the following runtimes for the MWE writing to an SSD:
8.1.0: 4067 ms
9.1.0: 4093 ms
10.0.0: 12954 ms
10.1.1: 13009 ms
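
In case it helps to narrow this down: the hot path of the MWE is the per-element fmt::format_to(ptr, FMT_COMPILE("{}:{:.10e} "), ...) call in format_libsvm_line. The following standalone sketch (a reduction of just that call, without the file I/O and OpenMP parts of the MWE; the value count and buffer size are chosen arbitrarily) times only the compiled double formatting, which should show whether that part alone accounts for the difference between the versions:

// Standalone micro-benchmark: times only the compiled "{}:{:.10e} "
// formatting from format_libsvm_line, without file I/O or OpenMP.
#include "fmt/compile.h"
#include "fmt/format.h"

#include <chrono>
#include <cstddef>
#include <random>
#include <vector>

int main() {
    std::mt19937 mt(42);
    std::uniform_real_distribution<double> dist(-1.0, 1.0);

    // arbitrary number of values; the generation is outside the timed loop
    std::vector<double> values(10'000'000);
    for (double &v : values) {
        v = dist(mt);
    }

    char buffer[64];  // large enough for one formatted "index:value " entry
    std::size_t total_bytes = 0;

    const auto start = std::chrono::steady_clock::now();
    for (std::size_t i = 0; i < values.size(); ++i) {
        // same compiled format string as in the MWE's hot loop
        char *out_end = fmt::format_to(buffer, FMT_COMPILE("{}:{:.10e} "), i + 1, values[i]);
        total_bytes += static_cast<std::size_t>(out_end - buffer);
    }
    const auto end = std::chrono::steady_clock::now();

    fmt::print("{} bytes formatted in {} ms\n", total_bytes,
               std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count());
    return 0;
}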

Am I missing something, or should there not be such a loss in performance simply from using a newer {fmt} version?
