Description
Module:
Description
A certain call stack caught my eye while profiling, and the final report from my profiler said that 8% of all of perl's CPU time is spent inside isupper()/toupper() from ucrtbase.dll. These float between place 4 and place 8 among the highest CPU hogs on random core .t files. Seeing an *upper() call reach #1 was jaw-dropping, hence I investigated.
After some research: this is 1 call per 1 U8. BTW, ::LocaleUpdate makes 6 FlsGetValue calls (wrapped with GetLastError preservation), and toupper() fires ::LocaleUpdate() every time; errno in ucrt adds another 4-5 FlsGetValue calls, and __acrt_LCMapStringA() fires ::LocaleUpdate again:
KernelBase.dll!LCMapStringEx() Unknown
kernel32.dll!LCMapStringExStub() Unknown
ucrtbase.dll!__acrt_LCMapStringA_stat() Unknown
ucrtbase.dll!__acrt_LCMapStringA() Unknown
ucrtbase.dll!toupper() Unknown
soon after
KernelBase.dll!DefaultSortVersion() Unknown
KernelBase.dll!VersionValue() Unknown
KernelBase.dll!SortChangeCase() Unknown
KernelBase.dll!LCMapStringEx() Unknown
A few CPU instruction addresses later (remember, lines of code have loops):
MinSortChangeCase()
KernelBase.dll!GetNamedLocaleHashNode() Unknown
KernelBase.dll!SortChangeCase() Unknown
KernelBase.dll!LCMapStringEx() Unknown
kernel32.dll!LCMapStringExStub() Unknown
ucrtbase.dll!__acrt_LCMapStringA_stat() Unknown
ucrtbase.dll!__acrt_LCMapStringA() Unknown
ucrtbase.dll!toupper() Unknown
KernelBase.dll tries building a tree of nodes, or iterating over all the country codes on earth. The data being searched by KernelBase.dll!GetNamedLocaleHashNode looks like a list of codes (dump not shown),
but this is raw memory with the unprintables regexed out. I think it is country codes, but I'm not going to reverse-engineer it.
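Benchmarks: it's horrible. (Note: the harness below scales the tick delta by 1000000000, so the figures printed with the "us" label are actually nanoseconds.)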
C:\sources\crtslow\CRTSlow>perl -Mblib -MCRTSlow -e"CRT::Be();";
cache wake 78063695 us Ln 337
tolower 2440267443 us Ln 341
_tolower 18608920 us Ln 345
toLOWER_A 17415609 us Ln 349
toLOWER_L1 17737359 us Ln 353
isgraph 2562596668 us Ln 357
isGRAPH_A 18712061 us Ln 361
isGRAPH_L1 17790779 us Ln 365
isalnum 2520004815 us Ln 369
isALPHANUMERIC_A 18165949 us Ln 373
isALPHANUMERIC_L1 18033632 us Ln 377
isalnum msvcrt 111663798 us Ln 392
isalnum msvcr100 98945415 us Ln 402
isalnum msvcr120 97687178 us Ln 412
C:\sources\crtslow\CRTSlow>
With pseudo-threads (Windows fork() emulation) on 3 cores. I don't know enough to say whether this is just scaling, or whether lock contention on the perl side or on the MS side is happening.
C:\sources\crtslow\CRTSlow>perl -Mblib -MCRTSlow -e"$r = fork(); $t = fork(); exit if !$t && $r; CRT::Be();";
cache wake 32082575 us Ln 337
cache wake 33263148 us Ln 337
cache wake 40918586 us Ln 337
tolower 3157700393 us Ln 341
tolower 3165074351 us Ln 341
_tolower 22321171 us Ln 345
_tolower 20705845 us Ln 345
toLOWER_A 22430065 us Ln 349
toLOWER_A 20589965 us Ln 349
toLOWER_L1 22076263 us Ln 353
toLOWER_L1 23926635 us Ln 353
tolower 3348870216 us Ln 341
_tolower 23873216 us Ln 345
toLOWER_A 24134972 us Ln 349
toLOWER_L1 23457776 us Ln 353
isgraph 3397327541 us Ln 357
isGRAPH_A 20434638 us Ln 361
isGRAPH_L1 24083196 us Ln 365
isgraph 3679713786 us Ln 357
isgraph 3694650315 us Ln 357
isGRAPH_A 24393851 us Ln 361
isGRAPH_A 25253907 us Ln 361
isGRAPH_L1 18637274 us Ln 365
isGRAPH_L1 23057951 us Ln 365
isalnum 3123988931 us Ln 369
isALPHANUMERIC_A 23420793 us Ln 373
isALPHANUMERIC_L1 20040976 us Ln 377
isalnum 3220601143 us Ln 369
isalnum 3333259367 us Ln 369
isALPHANUMERIC_A 26456259 us Ln 373
isALPHANUMERIC_A 25220212 us Ln 373
isALPHANUMERIC_L1 19804287 us Ln 377
isALPHANUMERIC_L1 26882794 us Ln 377
isalnum msvcrt 144513972 us Ln 392
isalnum msvcrt 151367705 us Ln 392
isalnum msvcrt 151673430 us Ln 392
isalnum msvcr100 133158199 us Ln 402
isalnum msvcr100 124220690 us Ln 402
isalnum msvcr100 123605543 us Ln 402
isalnum msvcr120 129325137 us Ln 412
isalnum msvcr120 118722581 us Ln 412
isalnum msvcr120 121761334 us Ln 412
C:\sources\crtslow\CRTSlow>
Steps to Reproduce
#define PERL_NO_GET_CONTEXT
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
/* Global Data */
/* Performance-counter frequency in ticks per second. It is zero until the
 * BOOT section fills it in; BTIMEEND divides by it. */
LARGE_INTEGER Frequency = { 0 };
#define g_Frequency Frequency
/* NOTE(review): START_MY_CXT normally expects MY_CXT_KEY and a my_cxt_t
 * typedef to be defined before this point; neither is visible in this
 * listing -- confirm they exist in the real file. */
START_MY_CXT
/* BTIME = BENCH TIME
 *
 * BTIMESTART opens a do { ... } while(0) scope, declares the timing locals
 * and samples the start counter. BTIMEEND(label) samples the end counter,
 * converts the tick delta to nanoseconds and prints
 * "<label> <elapsed> ns Ln <line>", then closes the scope. The two macros
 * must always be used as a pair.
 *
 * The tick delta is multiplied by 1000000000 before the division, so the
 * result is in nanoseconds. Earlier revisions printed the same number with a
 * "us" label; the label (not the scaling) was wrong, so the magnitudes are
 * unchanged.
 */
#define BTIMESTART do { \
    LARGE_INTEGER StartingTime, EndingTime, ElapsedNanoseconds; \
    QueryPerformanceCounter(&StartingTime)
#define BTIMEEND(label) \
    QueryPerformanceCounter(&EndingTime); \
    ElapsedNanoseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart; \
    ElapsedNanoseconds.QuadPart *= 1000000000; \
    ElapsedNanoseconds.QuadPart = ElapsedNanoseconds.QuadPart \
        / g_Frequency.QuadPart; \
    printf("%-30s %10I64u ns Ln %u\n", label, ElapsedNanoseconds.QuadPart, __LINE__); \
    } while(0)
/* Pointer/integer conversion shorthands. */
#define VP(_vp) ((size_t)(_vp))
#define CP(_cp) ((char *)(_cp))
#define VPP(_p) ((void**)(_p))
/* LST ... LEND wrap one benchmark body. The caller must have n, p, c, low,
 * hi, r and c1 in scope. The byte range [low, hi) is walked 10 times; each
 * byte is loaded into c before the body runs, and LEND adds r to the running
 * checksum c1 and advances p. */
#define LST for(n=0; n < 10; n++) { p = CP(low); while(p < CP(hi)){ c = *p;
#define LEND c1 += r; p++;} }
# ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a) : (b))
# endif
# ifndef MAX
# define MAX(a,b) ((a) > (b) ? (a) : (b))
# endif
MODULE = CRT PACKAGE = CRT
void
Be()
PPCODE:
PUTBACK;
HMODULE h_orig;
GetModuleHandleExW(
GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS,
&PL_No[0],
&h_orig
);
const IMAGE_DOS_HEADER* h = (const IMAGE_DOS_HEADER*)h_orig;
PIMAGE_NT_HEADERS nt_header = (PIMAGE_NT_HEADERS const)(VP(h) + VP(h->e_lfanew));
WORD NumberOfSections = nt_header->FileHeader.NumberOfSections;
/* IMAGE_FIRST_SECTION() macro is universal, and corrects for OPTIONAL32 vs OPTIONAL64 */
const IMAGE_SECTION_HEADER * sec = IMAGE_FIRST_SECTION(nt_header);
const IMAGE_SECTION_HEADER * secEnd = sec + NumberOfSections;
char * low =~0;
char * hi = 0;
char * lowi;
char * hii;
U32 r;
U32 c1 = 0;
U8 c;
char * p;
int n =0;
for(; sec < secEnd ; sec++) {
lowi = VP(h) + VP(sec->VirtualAddress);
hii = lowi + VP(sec->SizeOfRawData);
low = MIN(low,lowi);
hi = MAX(hi,hii);
}
BTIMESTART;
LST r = c; LEND
BTIMEEND("cache wake");
BTIMESTART; LST
tolower(c);
LEND
BTIMEEND("tolower");
BTIMESTART; LST
_tolower(c);
LEND
BTIMEEND("_tolower");
BTIMESTART; LST
toLOWER_A(c);
LEND
BTIMEEND("toLOWER_A");
BTIMESTART; LST
toLOWER_L1(c);
LEND
BTIMEEND("toLOWER_L1");
BTIMESTART; LST
isgraph(c);
LEND
BTIMEEND("isgraph");
BTIMESTART; LST
isGRAPH_A(c);
LEND
BTIMEEND("isGRAPH_A");
BTIMESTART; LST
isGRAPH_L1(c);
LEND
BTIMEEND("isGRAPH_L1");
BTIMESTART; LST
isalnum(c);
LEND
BTIMEEND("isalnum");
BTIMESTART; LST
isALPHANUMERIC_A(c);
LEND
BTIMEEND("isALPHANUMERIC_A");
BTIMESTART; LST
isALPHANUMERIC_L1(c);
LEND
BTIMEEND("isALPHANUMERIC_L1");
const char * dllname = 0;
unsigned char flag;
unsigned char f_type;
unsigned char f_len;
HANDLE h2;
dllname = (char *)&ms_crt_dllnames;
typedef int (__cdecl * isctypefn_t)(int);
h2 = LoadLibrary("msvcrt");
if(h2) {
isctypefn_t pfn = (isctypefn_t)GetProcAddress(h2, "isalnum");
if(pfn) {
BTIMESTART; LST
pfn(c);
LEND
BTIMEEND("isalnum msvcrt");
}
}
h2 = LoadLibrary("msvcr100.dll");
if(h2) {
isctypefn_t pfn = (isctypefn_t)GetProcAddress(h2, "isalnum");
if(pfn) {
BTIMESTART; LST
pfn(c);
LEND
BTIMEEND("isalnum msvcr100");
}
}
h2 = LoadLibrary("msvcr120.dll");
if(h2) {
isctypefn_t pfn = (isctypefn_t)GetProcAddress(h2, "isalnum");
if(pfn) {
BTIMESTART; LST
pfn(c);
LEND
BTIMEEND("isalnum msvcr120");
}
}
iscntrl(c);
isCNTRL_A(c);
isCNTRL_L1(c);
ispunct(c);
isPUNCT_A(c);
isPUNCT_L1(c);
isspace(c);
isSPACE_A(c) ;
isSPACE_L1(c) ;
isxdigit(c);
isXDIGIT_A(c);
isXDIGIT_L1(c);
isdigit(c);
isDIGIT_A(c);
isDIGIT_L1(c);
isalpha(c);
isALPHA_A(c);
isALPHA_L1(c);
XSRETURN_IV(c1);
return;
MODULE = CRTSlow PACKAGE = CRTSlow
BOOT:
{
    /* Query the performance-counter frequency once, at module load, and
     * keep it in the global that BTIMEEND divides by. */
    QueryPerformanceFrequency(&g_Frequency);
    /* Create the per-interpreter context declared with START_MY_CXT.
     * Any my_cxt_t fields that need initial values would be set right
     * after this line. */
    MY_CXT_INIT;
}
Expected behavior
Half joking, half serious: remove UCRT from the default Windows perl build configuration and link against msvcrt.dll instead.
Perl configuration
Summary of my perl5 (revision 5 version 41 subversion 7) configuration:
Derived from: 73172a67eaae5671dffc06b427f005810d151472
Platform:
osname=MSWin32
osvers=6.1.7601
archname=MSWin32-x64-multi-thread
uname=''
config_args='undef'
hint=recommended
useposix=true
d_sigaction=undef
useithreads=define
usemultiplicity=define
use64bitint=define
use64bitall=undef
uselongdouble=undef
usemymalloc=n
default_inc_excludes_dot=define
Compiler:
cc='cl'
ccflags ='-nologo -GF -W3 -MD -TC -DWIN32 -D_CONSOLE -DNO_STRICT -DWIN64 -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -D_WINSOCK_DEPRECATED_NO_WARNINGS -DPERL_TEXTMODE_SCRIPTS -DMULTIPLICITY -DPERL_IMPLICIT_SYS -DWIN32_NO_REGISTRY -DUSE_PERLIO'
optimize='-O1 -Zi -GL -fp:precise'
cppflags='-DWIN32'
ccversion='19.36.32535'
gccversion=''
gccosandvers=''
intsize=4
longsize=4
ptrsize=8
doublesize=8
byteorder=12345678
doublekind=3
d_longlong=undef
longlongsize=8
d_longdbl=define
longdblsize=8
longdblkind=0
ivtype='__int64'
ivsize=8
nvtype='double'
nvsize=8
Off_t='__int64'
lseeksize=8
alignbytes=8
prototype=define
Linker and Libraries:
ld='link'
ldflags ='-nologo -nodefaultlib -debug -opt:ref,icf -ltcg -libpath:"c:\pb64\lib\CORE" -machine:AMD64 -subsystem:console,"5.02"'
libpth="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.36.32532\lib\x64"
libs=oldnames.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib netapi32.lib uuid.lib ws2_32.lib mpr.lib winmm.lib version.lib odbc32.lib odbccp32.lib comctl32.lib msvcrt.lib vcruntime.lib ucrt.lib
perllibs=oldnames.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib netapi32.lib uuid.lib ws2_32.lib mpr.lib winmm.lib version.lib odbc32.lib odbccp32.lib comctl32.lib msvcrt.lib vcruntime.lib ucrt.lib
libc=ucrt.lib
so=dll
useshrplib=true
libperl=perl541.lib
gnulibc_version=''
Dynamic Linking:
dlsrc=dl_win32.xs
dlext=dll
d_dlsymun=undef
ccdlflags=' '
cccdlflags=' '
lddlflags='-dll -nologo -nodefaultlib -debug -opt:ref,icf -ltcg -libpath:"c:\pb64\lib\CORE" -machine:AMD64 -subsystem:console,"5.02"'
Characteristics of this binary (from libperl):
Compile-time options:
HAS_LONG_DOUBLE
HAS_TIMES
HAVE_INTERP_INTERN
MULTIPLICITY
PERLIO_LAYERS
PERL_COPY_ON_WRITE
PERL_DONT_CREATE_GVSV
PERL_HASH_FUNC_SIPHASH13
PERL_HASH_USE_SBOX32
PERL_IMPLICIT_SYS
PERL_MALLOC_WRAP
PERL_OP_PARENT
PERL_PRESERVE_IVUV
PERL_USE_SAFE_PUTENV
USE_64_BIT_INT
USE_ITHREADS
USE_LARGE_FILES
USE_LOCALE
USE_LOCALE_COLLATE
USE_LOCALE_CTYPE
USE_LOCALE_NUMERIC
USE_LOCALE_TIME
USE_NO_REGISTRY
USE_PERLIO
USE_PERL_ATOF
USE_THREAD_SAFE_LOCALE
Locally applied patches:
uncommitted-changes
Built under MSWin32
Compiled at Dec 20 2024 10:03:46
%ENV:
PERL_DOBK="1"
PERL_DOBP="1"
PERL_DODB="1"
@INC:
C:/pb64/site/lib/MSWin32-x64-multi-thread
C:/pb64/site/lib
C:/pb64/lib