| layout | title |
|---|---|
| post | 第38期 |
从reddit/hackernews/lobsters/meetingcpp 摘抄一些c++动态
每周更新
欢迎投稿,推荐或自荐文章/软件/资源等,请提交 issue
accu最近也要开始了,视频还没放
codedive也要开始了,看不过来了
编译器信息最新动态推荐关注hellogcc公众号 本周更新 Weekly 2021-11-10 第124期
execution的中文资料比较少,c++23最重要的特性,学无止境啊胖友们
AVX512指令导致降频
这是针对旧机型的。新机型最好自己测一下
TODO:怎么测?
c++ weekly的作者2021年出的书,值得一读,有些建议还是有点意思的
之前提到过,std::string的resize是会走一遍构造的,这对于后续操作来说无疑是多余的动作,所以编译器也开洞了一个resize_uninit接口,当然这不是标准的一部分
c++23加了个新的接口,resize_and_overwrite,让你resize的同时指定重写的逻辑,而不是走默认构造,godbolt体验
// Builds a string made of `str` repeated `size` times.
//
// Uses C++23 std::string::resize_and_overwrite so the destination buffer is
// written directly by the callback instead of being filled by resize() first.
//
// str  - pattern to repeat
// size - number of repetitions
// Returns the concatenation of `size` copies of `str`.
std::string resize_and_overwrite(const std::string& str, std::size_t size) {
    std::string ret;
    const auto step = std::size(str);
    ret.resize_and_overwrite(step * size, [&](auto* buf, auto /*n*/) {
        // The counter has the same type as `size`; an `unsigned int` counter
        // would wrap around before reaching a bound larger than UINT_MAX.
        for (std::size_t i = 0; i < size; ++i) {
            std::memcpy(buf + i * step, std::data(str), step);
        }
        // The value returned by the callback becomes the string's new length.
        return step * size;
    });
    return ret;
}
// Demo entry point: repeats "quantlab" four times and writes it to stdout.
int main() {
    const std::string repeated = resize_and_overwrite("quantlab", 4);
    // Expected output: quantlabquantlabquantlabquantlab
    std::cout << repeated;
}
Daniel Lemire大神新活,如何转int到字符串最快
写了几种常规操作
最常规
// Writes the 16 least significant decimal digits of x into out[0..15] as
// ASCII, zero padded on the left. No terminating NUL is written.
void to_string_backlinear(uint64_t x, char *out) {
  // Walk from the last slot towards the first, peeling one digit per step.
  char *cursor = out + 16;
  while (cursor != out) {
    --cursor;
    *cursor = (x % 10) + 0x30;  // 0x30 is ASCII '0'
    x /= 10;
  }
}
查表
// Formats x as 16 zero-padded ASCII decimal digits into out[0..15] using a
// lookup table of two-digit strings. No terminating NUL is written.
// The digit layout is only meaningful for x < 10^16.
void to_string_tree_table(uint64_t x, char *out) {
  // ASCII text of every value 00..99, two bytes per entry (entry k starts at
  // offset 2 * k). One row per tens digit.
  static const char table[201] =
      "00010203040506070809"
      "10111213141516171819"
      "20212223242526272829"
      "30313233343536373839"
      "40414243444546474849"
      "50515253545556575859"
      "60616263646566676869"
      "70717273747576777879"
      "80818283848586878889"
      "90919293949596979899";

  // Split into two 8-digit halves, then each half into two 4-digit groups.
  const uint64_t upper8 = x / 100000000;
  const uint64_t lower8 = x % 100000000;
  const uint64_t quads[4] = {
      upper8 / 10000,
      upper8 % 10000,
      lower8 / 10000,
      lower8 % 10000,
  };

  // Each 4-digit group contributes two table entries: its leading pair and
  // its trailing pair.
  for (int q = 0; q < 4; ++q) {
    char *dst = out + 4 * q;
    memcpy(dst, &table[2 * (quads[q] / 100)], 2);
    memcpy(dst + 2, &table[2 * (quads[q] % 100)], 2);
  }
}
查一个巨大巨大的表
// Formats x into out[0..15] by copying four 4-byte entries out of `bigtable`.
// `bigtable` comes from bigtable.h, which is not shown here; presumably it
// holds the 4-character ASCII text of every value 0000..9999 - TODO confirm.
void to_string_tree_bigtable(uint64_t x, char *out) {
#include "bigtable.h"
  // Two 8-digit halves, each split again into two 4-digit groups.
  const uint64_t upper8 = x / 100000000;
  const uint64_t lower8 = x % 100000000;
  const uint64_t quads[4] = {
      upper8 / 10000,
      upper8 % 10000,
      lower8 / 10000,
      lower8 % 10000,
  };
  // One table entry (4 bytes) per group, written left to right.
  for (int q = 0; q < 4; ++q) {
    memcpy(out + 4 * q, &bigtable[4 * quads[q]], 4);
  }
}
SIMD方法参考这个
#ifdef __SSE2__
// mula
// SSE2 conversion of a 16-digit decimal number to text. Only compiled when
// the compiler defines __SSE2__.
#include <x86intrin.h>
// Writes 16 bytes to `out` with a single unaligned 128-bit store; no
// terminating NUL is written.
//   v   - value to format; the lane comments below treat it as exactly 16
//         decimal digits "abcdefghijklmnop"
//   out - destination with room for at least 16 bytes
// NOTE(review): the lane diagrams are kept from the original snippet; the
// left-to-right order shown may not match the actual byte order inside the
// register - verify before relying on them.
void to_string_sse2(uint64_t v, char *out) {
// v is 16-digit number = abcdefghijklmnop
// Division by a constant is done as multiply + shift with a precomputed
// reciprocal: 0xd1b71759 ~= 2^45 / 10000, used with a 64-bit right shift of 45.
const __m128i div_10000 = _mm_set1_epi32(0xd1b71759);
const __m128i mul_10000 = _mm_set1_epi32(10000);
const int div_10000_shift = 45;
// 0x147b ~= 2^19 / 100: mulhi keeps the high 16 bits (>> 16), then >> 3 more.
const __m128i div_100 = _mm_set1_epi16(0x147b);
const __m128i mul_100 = _mm_set1_epi16(100);
const int div_100_shift = 3;
// 0x199a ~= 2^16 / 10: the mulhi alone performs the division.
const __m128i div_10 = _mm_set1_epi16(0x199a);
const __m128i mul_10 = _mm_set1_epi16(10);
const __m128i ascii0 = _mm_set1_epi8('0');
// can't be easily done in SSE
const uint32_t a = v / 100000000; // 8-digit number: abcdefgh
const uint32_t b = v % 100000000; // 8-digit number: ijklmnop
// [ 3 | 2 | 1 | 0 | 3 | 2 | 1 | 0 | 3 | 2 | 1 | 0 | 3 | 2 | 1 | 0 ]
// x = [ 0 | ijklmnop | 0 | abcdefgh ]
__m128i x = _mm_set_epi64x(b, a);
// x div 10^4 = [ 0 | ijkl | 0 | abcd ]
__m128i x_div_10000;
x_div_10000 = _mm_mul_epu32(x, div_10000);
x_div_10000 = _mm_srli_epi64(x_div_10000, div_10000_shift);
// x mod 10^4 = [ 0 | mnop | 0 | efgh ]
// (computed as x - (x div 10^4) * 10^4)
__m128i x_mod_10000;
x_mod_10000 = _mm_mul_epu32(x_div_10000, mul_10000);
x_mod_10000 = _mm_sub_epi32(x, x_mod_10000);
// y = [ mnop | ijkl | efgh | abcd ]
__m128i y = _mm_or_si128(x_div_10000, _mm_slli_epi64(x_mod_10000, 32));
// y_div_100 = [ 0 | mn | 0 | ij | 0 | ef | 0 | ab ]
__m128i y_div_100;
y_div_100 = _mm_mulhi_epu16(y, div_100);
y_div_100 = _mm_srli_epi16(y_div_100, div_100_shift);
// y_mod_100 = [ 0 | op | 0 | kl | 0 | gh | 0 | cd ]
// (computed as y - (y div 100) * 100)
__m128i y_mod_100;
y_mod_100 = _mm_mullo_epi16(y_div_100, mul_100);
y_mod_100 = _mm_sub_epi16(y, y_mod_100);
// z = [ mn | op | ij | kl | ef | gh | ab | cd ]
__m128i z = _mm_or_si128(y_div_100, _mm_slli_epi32(y_mod_100, 16));
// z_div_10 = [ 0 | m | 0 | o | 0 | i | 0 | k | 0 | e | 0 | g | 0 | a | 0 | c ]
__m128i z_div_10 = _mm_mulhi_epu16(z, div_10);
// z_mod_10 = [ 0 | n | 0 | p | 0 | j | 0 | l | 0 | f | 0 | h | 0 | b | 0 | d ]
// (computed as z - (z div 10) * 10)
__m128i z_mod_10;
z_mod_10 = _mm_mullo_epi16(z_div_10, mul_10);
z_mod_10 = _mm_sub_epi16(z, z_mod_10);
// interleave z_mod_10 and z_div_10 -
// tmp = [ m | n | o | p | i | j | k | l | e | f | g | h | a | b | c | d ]
__m128i tmp = _mm_or_si128(z_div_10, _mm_slli_epi16(z_mod_10, 8));
// convert to ascii: add '0' to each of the 16 byte lanes
tmp = _mm_add_epi8(tmp, ascii0);
// and save result (unaligned store, so `out` needs no particular alignment)
_mm_storeu_si128((__m128i *)out, tmp);
}
#endif
如果你用不了SIMD,有个Packed SIMD方案,又叫做SIMD within a register (SWAR),原理看这个贴
// credit: Paul Khuong
// SWAR ("SIMD within a register") helper: converts two numbers below 10000
// into eight decimal digit values (0-9), one per byte of the result. The
// first digit of `hi` lands in the lowest byte and the last digit of `lo` in
// the highest byte; the caller still has to add ASCII '0' to every byte.
uint64_t encode_ten_thousands(uint64_t hi, uint64_t lo) {
  // Keep both inputs in one register, 32 bits apart.
  const uint64_t packed = (lo << 32) | hi;

  // Truncating division by 100 in both halves: 10486 / 2**20 ~= 1/100.
  const uint64_t lane_mask_7bit = (0x7FULL << 32) | 0x7FULL;
  const uint64_t quotient100 = ((packed * 10486ULL) >> 20) & lane_mask_7bit;

  // Remainder of the division, i.e. the trailing 2 digits of each 1e4 chunk.
  const uint64_t remainder100 = packed - 100ULL * quotient100;

  // Four radix-100 digits in little-endian order, each in its own 16-bit
  // area.
  const uint64_t radix100 = (remainder100 << 16) + quotient100;

  // Divide all four radix-100 digits by 10 in parallel (103 / 2**10 ~= 1/10)
  // and keep only the 4-bit quotient of every 16-bit area.
  const uint64_t lane_mask_4bit =
      (0xFULL << 48) | (0xFULL << 32) | (0xFULL << 16) | 0xFULL;
  const uint64_t leading_digits = ((radix100 * 103ULL) >> 10) & lane_mask_4bit;

  // The matching remainders mod 10 go into the byte above each quotient.
  const uint64_t trailing_digits = radix100 - 10ULL * leading_digits;
  return leading_digits + (trailing_digits << 8);
}
// Formats x into out[0..15] as two 8-byte groups produced by
// encode_ten_thousands(). No terminating NUL is written.
// NOTE(review): the 64-bit results are copied byte-for-byte, so the digit
// order presumably relies on a little-endian host - confirm.
void to_string_khuong(uint64_t x, char *out) {
  // halves[0] holds the leading 8 digits, halves[1] the trailing 8 digits.
  const uint64_t halves[2] = {x / 100000000, x % 100000000};
  for (int k = 0; k < 2; ++k) {
    // Adding 0x30 to every byte turns the digit values into ASCII.
    const uint64_t ascii =
        0x3030303030303030 +
        encode_ten_thousands(halves[k] / 10000, halves[k] % 10000);
    memcpy(out + 8 * k, &ascii, sizeof(ascii));
  }
}
简单bench,查表无敌,压测代码在这里
PS
内核中用哪个实现?
/* Based on code by Douglas W. Jones found at
 * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
 * (with permission from the author).
 * Performs no 64-bit division and hence should be fast on 32-bit machines.
 *
 * Emits the decimal digits of n into buf and returns the advanced buffer
 * pointer. Relies on put_dec_trunc8() and put_dec_helper4(), which are
 * defined outside this excerpt.
 * NOTE(review): the digits appear to be stored least significant first
 * (the low-order groups are written at the lowest addresses and trailing
 * '0' characters are stripped at the end) - confirm against the helpers.
 */
static
char *put_dec(char *buf, unsigned long long n)
{
uint32_t d3, d2, d1, q, h;
/* Values below 10^8 are handed straight to put_dec_trunc8(). */
if (n < 100*1000*1000)
return put_dec_trunc8(buf, n);
/* Split n into 16-bit pieces d3:d2:d1:d0; d0 is computed inline below. */
d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
h = (n >> 32);
d2 = (h ) & 0xffff;
d3 = (h >> 16); /* implicit "& 0xffff" */
/* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
 *   = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0
 * The multipliers used below are the 4-digit groups of those constants,
 * taken from the least significant group upwards. */
q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
/* Presumably put_dec_helper4() writes 4 digits and returns what is left
 * to carry into the next group - TODO confirm. */
q = put_dec_helper4(buf, q);
q += 7671 * d3 + 9496 * d2 + 6 * d1;
q = put_dec_helper4(buf+4, q);
q += 4749 * d3 + 42 * d2;
q = put_dec_helper4(buf+8, q);
q += 281 * d3;
/* Twelve digit positions have been filled by the three helper calls. */
buf += 12;
if (q)
buf = put_dec_trunc8(buf, q);
/* Nothing left to print: step back over '0' characters already written. */
else while (buf[-1] == '0')
--buf;
return buf;
}
另外多说一句,从字符串转成数字,最快的是std::from_chars, gcc8之后可用
simd和swar的实现说实话得研究研究才看得懂。暂时标记个TODO后面看吧
讨论了一下菱形继承下的内存布局
也说过很多次了,这个-ffast-math有很多优化,可能会引入bug
In GCC, `-ffast-math` (or `-Ofast`) enables the following options: `-fno-math-errno`, `-funsafe-math-optimizations`, `-ffinite-math-only`, `-fno-rounding-math`, `-fno-signaling-nans`, `-fcx-limited-range` and `-fexcess-precision=fast`. Note that `-funsafe-math-optimizations` is itself a collection of options `-fno-signed-zeros`, `-fno-trapping-math`, `-fassociative-math` and `-freciprocal-math`, plus some extra ones, which we will discuss further below.
比如-fno-math-errno 会引起malloc不报错
比如-ffinite-math-only会导致isnan函数直接失效(编译器假定不会出现NaN)
你一定要知道你的场景里会不会有这些判断,不然没有意义
目前c++还是没有TU级别的优化限定的。
要么所有都优化要么都不优化,不能这个o优化那个o不优化,所以还是要注意优化项的使用
手把手教你看spinlock汇编
首先回顾一下Heterogeneous Lookup 的概念
一般来说容器的find/contains只能查相同的类型,就会有这种坑爹的场景
// Illustrative fragment: `...` stands for initialization that is elided.
// The map uses its default comparator, so find() only accepts the key type.
std::map<std::string, int> m = ...;
absl::string_view some_key = ...;
// Construct a temporary `std::string` to do the query.
// The allocation + copy + deallocation might dominate the find() call.
auto it = m.find(std::string(some_key));
白白多了一个复制,如何避免这种浪费呢,引入Heterogeneous Lookup,具体的做法就是定制一个transparent 比较器
struct StringCmp {
using is_transparent = void;
bool operator()(absl::string_view a, absl::string_view b) const {
return a < b;
}
};
// Illustrative fragment: `...` stands for initialization that is elided.
// Same map as before, but ordered by the transparent `StringCmp`.
std::map<std::string, int, StringCmp> m = ...;
absl::string_view some_key = ...;
// The comparator `StringCmp` will accept any type that is implicitly
// convertible to `absl::string_view` and says so by declaring the
// `is_transparent` tag.
// We can pass `some_key` to `find()` without converting it first to
// `std::string`. In this case, that avoids the unnecessary memory allocation
// required to construct the `std::string` instance.
auto it = m.find(some_key);
再来看这篇文章的例子 @Compiler Explorer
#include <cstdlib>
#include <functional>
#include <iostream>
#include <map>
#include <new>
#include <set>
#include <string>
#include <string_view>
// Simple global new/delete replacements that log every allocation, so we can
// check whether some memory was allocated by a lookup.

// Logs the requested size, then allocates with std::malloc.
// Throws std::bad_alloc when the allocation fails: a replacement for the
// throwing form of operator new must never return a null pointer.
void* operator new(std::size_t sz) {
    std::cout << "Allocating: " << sz << '\n';
    // malloc(0) is allowed to return nullptr, so always request at least
    // one byte; the logged size above stays the size that was asked for.
    if (void* ptr = std::malloc(sz != 0 ? sz : 1)) {
        return ptr;
    }
    throw std::bad_alloc{};
}

// Releases memory obtained from the operator new above.
void operator delete(void* ptr) noexcept {
    std::free(ptr);
}
// Demo: performs the same lookups on a map with the default comparator and on
// a map with the transparent std::less<>, so the "Allocating: ..." lines
// printed by the replaced operator new show which lookups allocate.
int main()
{
// Building both maps allocates (nodes and key strings), so expect a batch of
// "Allocating" lines before the first lookup banner.
std::map<std::string, int> intMap { { "Hello Super Long String", 1 }, { "Another Longish String", 2 }, {"This cannot fall into SSO buffer", 3 }};
std::map<std::string, int, std::less<>> trIntMap { { "Hello Super Long String", 1 }, { "Another Longish String", 2 }, {"This cannot fall into SSO buffer", 3 }};
// Default comparator: the const char* argument has to be converted to the
// key type std::string first; presumably that temporary is what allocates.
std::cout << "Lookup in intMap with by const char*:\n";
std::cout << intMap.contains("Hello Super Long String") << '\n';
// Disabled in the original; NOTE(review): presumably because a
// std::string_view does not convert implicitly to the std::string key.
/*std::cout << "Lookup in intMap with string_view:\n";
std::string_view sv("Another Longish String");
std::cout << intMap.contains(sv) << '\n';*/
// Transparent comparator: the argument is compared as-is, no key is built.
std::cout << "Lookup in trIntMap by const char*: \n";
std::cout << trIntMap.contains("Hello Super Long String") << '\n';
std::cout << "Lookup in trIntMap by string_view: \n";
std::string_view sv2("Another Longish String");
std::cout << trIntMap.contains(sv2) << '\n';
}
能看到trIntMap没有多余的内存分配,std::less<>和std::less<std::string>差别这么大?
其实是特化了 transparent比较
// Simplified view of the standard library's std::less<void> specialization.
template <>
struct less<void> {
    // Marks the comparator as transparent, which enables heterogeneous
    // lookup in the ordered associative containers.
    using is_transparent = int;

    // simplified version...
    // Compares two values of possibly different types with operator<,
    // forwarding each argument with its original value category.
    template <class _Ty1, class _Ty2>
    constexpr auto operator()(_Ty1&& _Left, _Ty2&& _Right) const {
        return static_cast<_Ty1&&>(_Left) < static_cast<_Ty2&&>(_Right);
    }
};
这个带来的性能提升(省拷贝)还是很可观的,但是会引入转换问题,一定要注意
另外就是这个是c++14引入的,(void做模版参数)可能旧的库使用上会有ABI问题
简单说execution是板上钉钉了。一堆range相关的,network ts没影。(据说两者冲突,总之ASIO又没进)
一个分布式计算框架
教你用python分析一下二进制ELF机器码之类的,有空可以玩玩
介绍abidiff的,挺有意思
- Making Iterators, Views and Containers Easier to Write with Boost.STLInterfaces - Zach Laine CppCon 介绍 Boost.STLInterfaces的,一个接口类库
介绍手机硬件用c++的一些经验,比如不用多线程之类的费电的操作,这个人的博客google权重挺高的,很容易搜到,可以看看
介绍QAnyStringView 的,qt string相关的东西太多了,qstringview不太行,又加个qanystringview,更牛逼一点
ip地址很容易和UDL结合,比如"192.168.0.1"_ipv4
那么,如何保证这个字符串是合法的ip呢,如何用UDL来校验ip合法性,作者给了段代码。编译期判定。很有趣, godbolt体验
// With friendly permission of BENOCS GmbH (www.benocs.com)
#include <cstdio> // Needed for the print-function only!
#include <cstdint>
#include <variant>
#include <system_error>
/// Minimal value type holding one IPv4 address.
struct IPv4Address
{
    /// All four octets packed into 32 bits; a default-constructed address is 0.
    std::uint32_t addr{0};
};
/// Parses and extracts a decimal number from the given string.
///
/// Reads consecutive ASCII digits from `str`, starting at `start_index` and
/// stopping at the first non-digit character or at `length`.
///
/// @param str          Characters to parse; only indices below `length` are read.
/// @param length       Number of valid characters in `str`.
/// @param start_index  In: position of the first character to read.
///                     Out: position one past the last digit consumed.
///                     Left unchanged when an error is returned.
/// @param max          Largest value that is accepted.
/// @return The parsed number, or
///         `std::errc::result_out_of_range` if there is no digit at
///         `start_index`, or `std::errc::value_too_large` if the number
///         exceeds `max` or does not fit into 64 bits.
constexpr auto extract_dec_number(const char* str, unsigned long length,
                                  unsigned long& start_index,
                                  std::uint64_t max)
noexcept
-> std::variant<std::uint64_t, std::errc>
{
    constexpr auto is_digit = [](char c) { return '0' <= c && c <= '9'; };
    constexpr auto get_digit = [](char c) -> std::uint8_t { return c - '0'; };

    // Parse number.
    std::uint64_t num = 0; // Will hold the parsed number.
    auto index = start_index;
    for (; index < length; ++index)
    {
        const char c = str[index];
        if (! is_digit(c)) break;

        const std::uint64_t digit = get_digit(c);

        // Check for overflow BEFORE multiplying. Comparing the wrapped
        // result with the previous value is not reliable, because a wrapped
        // product can still end up larger than the previous value.
        if (num > (UINT64_MAX - digit) / 10)
            return std::errc::value_too_large;

        num = num * 10 + digit;

        // Parsed number is too large?
        if (num > max)
            return std::errc::value_too_large;
    }

    // No digit parsed at all?
    if (start_index == index)
        return std::errc::result_out_of_range;

    // Return new index and extracted number.
    start_index = index;
    return num;
}
/// Parses a dotted-decimal IPv4 address ("a.b.c.d") out of the given string.
///
/// Parsing begins at `start_index`. On success `start_index` is moved to the
/// position right after the fourth group and the four groups are returned
/// packed into 32 bits, the first group in the most significant byte.
/// Every parse error is reported by throwing a `const char*` message.
#if __cpp_consteval >= 201811
consteval
#else
constexpr
#endif
auto extract_ipv4_address(const char* str, unsigned long length,
                          unsigned long& start_index)
-> std::uint32_t
{
    constexpr auto is_digit = [](char c) { return '0' <= c && c <= '9'; };
    constexpr auto is_dot = [](char c) { return '.' == c; };

    // Nothing to parse at all.
    if (start_index == length)
        throw "Invalid IPv4 address. [Reason: Empty string-literal]";

    // The address has to begin with a digit.
    if (! is_digit( str[start_index] ))
        throw "Invalid IPv4 address. [Reason: Illegal token]";

    std::uint32_t packed = 0;          // Groups collected so far.
    unsigned long cursor = start_index;
    unsigned long groups_read = 0;

    while (groups_read < 4 && cursor < length)
    {
        // Every group except the first must be preceded by a '.'; the
        // separator character is consumed either way.
        if (groups_read != 0)
        {
            const char separator = str[cursor];
            ++cursor;
            if (! is_dot(separator))
                throw "Invalid IPv4 address. [Reason: Illegal token]";
        }

        // Parse the next group; `cursor` only moves when parsing succeeds.
        const unsigned long group_begin = cursor;
        const auto parsed = extract_dec_number(str, length, cursor, 255u);

        // Translate parse errors into messages.
        if (const auto* failure = std::get_if<std::errc>(&parsed))
        {
            if (*failure == std::errc::value_too_large)
                throw "Invalid IPv4 address. [Reason: Invalid number in group]";
            if (*failure == std::errc::result_out_of_range)
                throw "Invalid IPv4 address. [Reason: Missing number in group]";
            throw "Invalid IPv4 address.";
        }

        // At most 3 digits are allowed per group.
        if (cursor - group_begin > 3)
            throw "Invalid IPv4 address. [Reason: Too many leading zeros in group]";

        // Shift the earlier groups up and append this one.
        packed = (packed << 8) | static_cast<std::uint32_t>(std::get<0>(parsed));
        ++groups_read;
    }

    // The input ended before four groups were read.
    if (groups_read != 4)
        throw "Invalid IPv4 address. [Reason: Too few groups]";

    // Report how far we got and hand back the address.
    start_index = cursor;
    return packed;
}
/// A literal operator for user-defined literal representing an IPv4 address type.
///
/// Usage: `"192.168.0.1"_ipv4`. The whole literal must be consumed by the
/// address parser; otherwise a `const char*` message is thrown.
///
/// Spelled `operator""_ipv4` without whitespace before the suffix: the form
/// with a space (`operator "" _ipv4`) is deprecated.
#if __cpp_consteval >= 201811
consteval
#else
constexpr
#endif
IPv4Address operator""_ipv4(const char* str, std::size_t length)
{
    unsigned long index = 0;
    auto addr = extract_ipv4_address(str, length, index);

    // Anything left over after the fourth group makes the literal invalid.
    if (index != length)
        throw "Invalid IPv4 address. [Reason: Additional tokens at end]";

    return IPv4Address{ addr };
}
/// Prints the address in dotted-decimal notation followed by a newline,
/// most significant byte first.
void print(IPv4Address ip)
{
    // The masked bytes are unsigned values, so they are passed as `unsigned`
    // and printed with %u; %d expects an `int` argument.
    std::printf( "%u.%u.%u.%u\n", static_cast<unsigned>((ip.addr >> 24) & 0xff)
                                , static_cast<unsigned>((ip.addr >> 16) & 0xff)
                                , static_cast<unsigned>((ip.addr >>  8) & 0xff)
                                , static_cast<unsigned>((ip.addr >>  0) & 0xff) );
}
// Demo driver: prints one valid address, then feeds six malformed literals
// to the literal operator and prints the message each one throws.
// NOTE(review): when the literal operator is built as `consteval`, the
// malformed literals below presumably fail to compile instead of throwing
// at run time - confirm with the intended compiler settings.
int main()
{
    // Well-formed literal.
    print( "192.168.0.1"_ipv4 );

    // Last group is larger than 255.
    try { print( "192.168.0.256"_ipv4 ); }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }

    // Trailing dot without a number after it.
    try { print( "192.168.0."_ipv4 ); }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }

    // Only three groups.
    try { print( "192.168.0"_ipv4 ); }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }

    // Extra characters after the fourth group.
    try { print( "192.168.0.1.123"_ipv4 ); }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }

    // Empty literal.
    try { print( ""_ipv4 ); }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }

    // A group with four digits.
    try { print( "192.0168.0.1"_ipv4 ); }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }

    // Important:
    //   In order to guarantee compile-time evaluation of constexpr, it
    //   must be called from a compile-time context, e.g. a static_assert!
    // Note:
    //        192  168  0   1
    //   <==> 0xc0 0xa8 0x0 0x1
    /*
    static_assert(("192.168.0.1"_ipv4).addr == 0xc0a80001);
    static_assert(("192.168.0.256"_ipv4).addr == 0xc0a80001);
    static_assert(("192.168.0."_ipv4).addr == 0xc0a80001);
    static_assert(("192.168.0"_ipv4).addr == 0xc0a80001);
    static_assert(("192.168.0.1.123"_ipv4).addr == 0xc0a80001);
    static_assert((""_ipv4).addr == 0x0);
    static_assert(("192.0168.0.1"_ipv4).addr == 0xc0a80001);
    */
    return 0;
}
先假定你懂execution的概念,sender receiver, operation state = connect, start(op), execution context, scheduler, 如果不懂,看第二个文章
execution提案主要作者eric niebler(也是range-v3作者,提案推动,所以有点api设计是相似的),再讲一遍对应的概念
讲程序设计,以及execution的设计,让代码更好维护/组织,异步更异步,异步也能取消
有个列表 简单看了一下,比较杂,不是特别c++,就不总结了。等2021出了有意思的,再说
- hackingcpp网站更新了profile工具列表
- Refureku 一个反射库
- ces 协程+epoll/socket,玩出花样
- thread-pool 一个c++20的线程池实现 非常简单,mutex + condvar, 用到了requires所以是c++20的