From 0c86c56e8e1f58b67dbdd759298aabad35711603 Mon Sep 17 00:00:00 2001 From: Alexander Freudenberg <50404848+alexfreudenberg@users.noreply.github.com> Date: Fri, 26 May 2023 08:39:52 +0200 Subject: [PATCH] Merge Martin's version 25/05/23 (#6) Merge changes to core miraculix by Martin --- .gitignore | 2 + src/miraculix/1bit128.cc~ | 171 -- src/miraculix/2bitUint.cc | 2 +- src/miraculix/3bitUint.cc | 2 +- src/miraculix/5codes512.cc | 8 +- src/miraculix/5codesAPI.c | 6 +- src/miraculix/5codesChar.cc~ | 434 ----- src/miraculix/5codesIntern.h | 193 ++- src/miraculix/5codesUint.cc | 24 +- src/miraculix/Files32.cc | 8 +- src/miraculix/FilesUint.cc | 3 +- src/miraculix/MXinfo.h | 1 + src/miraculix/MoBPS_R.cc | 2 - src/miraculix/Template.h | 14 +- src/miraculix/Vector.matrix.Uint.cc | 2 +- src/miraculix/compatibility.C.h | 6 +- src/miraculix/compatibility.R.h | 6 +- src/miraculix/compatibility.c.cc | 7 + src/miraculix/compatibility.general.h | 3 +- src/miraculix/compatibility.sexp.cc | 2 +- src/miraculix/haplogeno.cc | 8 +- src/miraculix/intrinsics.h | 5 +- src/miraculix/intrinsicsIntern.h | 4 +- src/miraculix/kleinkram.cc | 1 - src/miraculix/main.cc | 89 +- src/miraculix/makefile.c.mk | 7 +- src/miraculix/makefile.miraculix.mk~ | 22 - src/miraculix/makefile.rfu.mk | 2 +- src/miraculix/makefile.rfu.mk~ | 24 - src/miraculix/options_rrfu.cc | 284 ++++ src/miraculix/parallel_simd.h | 3 +- src/miraculix/plink.h | 2 +- src/miraculix/plink256.cc | 4 +- src/miraculix/plinkUint.cc | 7 +- src/miraculix/solve_rrfu.cc | 2223 +++++++++++++++++++++++++ src/miraculix/transformUint.cc | 2 +- src/miraculix/xport_import_rrfu.cc | 51 + tests/dgemm_compressed/test.jl | 2 +- 38 files changed, 2802 insertions(+), 834 deletions(-) delete mode 100755 src/miraculix/1bit128.cc~ delete mode 100755 src/miraculix/5codesChar.cc~ delete mode 100755 src/miraculix/makefile.miraculix.mk~ delete mode 100755 src/miraculix/makefile.rfu.mk~ create mode 100755 src/miraculix/options_rrfu.cc create mode 100755 src/miraculix/solve_rrfu.cc create mode 100755 src/miraculix/xport_import_rrfu.cc diff --git a/.gitignore b/.gitignore index 6c104721..6dd69f46 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ *.freq *.bed *.mem +*.*~ + *.optrpt *.tgz diff --git a/src/miraculix/1bit128.cc~ b/src/miraculix/1bit128.cc~ deleted file mode 100755 index fd698dce..00000000 --- a/src/miraculix/1bit128.cc~ +++ /dev/null @@ -1,171 +0,0 @@ -/* - Authors - Guido Moerkotto - maintained and modified by Martin Schlather, martin.schlather@uni-mannheim.de - - Copyright (C) 2022-2023 Guido Moerkotte, Martin Schlather - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - - - -#define BitsPerCode 1 -#define MY_VARIANT 128 - -#include "Basic_miraculix.h" -#include "intrinsics_specific.h" -#include "xport_import.h" -#include "options.h" -// -// - -#if defined SSSE3 - -ASSERT_SIMD(1bit128, ssse3); - -#include "MX.h" -#include "haplogeno.h" -#include "intrinsicsIntern.h" -#include "1bitIntern.h" - - - -void trafo1Geno2Geno128(unit_t *OLD, Long snps, Long individuals, - Long oldLDA, unit_t *Ans, Long ldAns) { - assert(2 * oldLDA == ldAns); - - Long blocks = Blocks(snps); - - if (OLD == Ans) { - ERR0("not programed yet"); - // (Braucht nur 1/4 Speicherplatz extra) - // (i)drittes Viertel rauskopieren + 1-2 Spalten mehr vorne freilasen - // (ii) haelfte durcharbeitne - // (iii) drittel Vierel aus speicher + in die Spalte(n) davor im - // Speicher viertel Viertel spaltenweise reinkopieren - // (iv) letztes Viertel aus dem Speicher mit den 1-2 Spalten frueher - // abarbeiten. - } - - - const BlockType - O2F2 = SET32(0x00FF00FF), - F2O2 = SET32(0xFF00FF00), - - O4F4 = SET32(0x0000FFFF), - F4O4 = SET32(0xFFFF0000); - - -#if ! defined PlainInteger32 - const BlockType O8F8 = SET64(0x00000000FFFFFFFF); -#endif - -#if defined AVX512F - const BlockType cross_idx = - _mm512_set_epi32(TAKE_B + 14, TAKE_B + 12, TAKE_B + 10, TAKE_B + 8, - TAKE_B + 6, TAKE_B + 4, TAKE_B + 2, TAKE_B + 0, - 14, 12, 10, 8, 6, 4, 2, 0); -#elif defined AVX2 - const BlockType cross_idxA = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); - const BlockType cross_idxC = _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4); - const int halfreg_idx = 0 + (2 << 4); -#elif defined SSE2 - const int cross_idxA = 0 + (1 << 4); - const int cross_idxC = 2 + (3 << 4); -#elif defined PlainInteger64 || defined PlainIteger32 - #define cross_idxA 0 - #define cross_idxC 1 -#endif - - - - for (Long j=0; jbasic.Cprintlevel) { - const char *ft [] ={"false", "true", "tba", "tba"}; - int G = (int) global->tuning.gpu << 1; - PRINTF("\n"); - PRINTF("gpu : %s\n", ft[global->tuning.gpu]); - if (!global->tuning.gpu) { - if ( global->genetics.centered != NoCentering) - PRINTF("ignore missings: %s\n", ft[G + global->tuning.missingsFully0]); - PRINTF("do center: %s\n", ft[G + (global->genetics.centered != NoCentering)]); - if (global->tuning.floatLoop >=0) - PRINTF("float : %s\n", ft[G + global->tuning.floatLoop > 0]); - else PRINTF("precise : true\n"); - PRINTF("meanV : %s\n", ft[G + global->tuning.meanVsubstract]); - PRINTF("meanSxI : %s\n", ft[G + global->tuning.meanSxIsubstract]); - PRINTF("normalize: %s\n", ft[G + global->genetics.normalized] ); - PRINTF("variant : %d \n", global->tuning.variant); - } - PRINTF("use fortran freq: %s\n", - ft[G + global->genetics.prefer_external_freq] ); - PRINTF("cores : %d \n", utils->basic.cores ); - PRINTF("print lvl: %d \n", utils->basic.Cprintlevel ); - } -} - - - -void get_started(option_type **g, utilsoption_type **u) { - static option_type *global = NULL; - static utilsoption_type *utils = NULL; - if (global == NULL) { - startMiraculix(1); - WhichOptionList(false, &global, &utils); - - // printf(">>>>>>>>> global %ld\n", (Long) global); - - setOptions5( -#if defined CUDA - true, -#else - false, -#endif - 0, // cores - false, // floatLoop - false, // meanV - false, // meanSxI - false, // irgnore missings - RowMeans, // centering - NoNormalizing, // normalize - true, // use Fortran freq - 32, - false // print details - ); - print_options(global, utils); - Ext_startRFU(); - // printf(" >> "); //, (Long) (&OPTIONS), (Long) utils; - } - *g = global; - *u = utils; - assert(global != NULL); - assert(utils != NULL); - // printf("getstarted_done %ld\n", (Long) global); - if (utils->basic.Cprintlevel > 2) { - PRINTF("<<<<< global %ld\n", (Long) global); - print_options(global, utils); - } -} - - - -void setOptions5(int gpu, int cores, int floatLoop, - int meanVsubstract, int meanSxIsubstract, - int missingsFully0, - int centered, int normalized, - int prefer_external_freq, - int variant, int print_details) { - if (print_details > 10) PRINTF("entering setoptions cores = %d\n", cores); - option_type *global; - utilsoption_type *utils; - get_started(&global, &utils); - if (print_details > 10) PRINTF("B %ld %ld cores = %d\n", (Long) global, (Long) utils, cores); - tuning_options *tuning = &(global->tuning); - genetics_options *genetics = &(global->genetics); - if (cores == 0) { - const char* s = getenv("OMP_NUM_THREADS"); - if (s != NULL) cores = (int) strtoimax(s, NULL, 10); - if (cores <= 0) { - PRINTF("WARNING: seeing 'OMP_NUM_THREADS=%s' only -- #threads set to 4\n", s); - cores = 4; - //PRINTF("number of threads not positive"); BUG; - } - } - if (print_details > 10) PRINTF("cores = %d\n", cores); - utils->basic.cores = cores; - tuning->gpu = (bool) gpu; - if (gpu && - (!prefer_external_freq || missingsFully0 || centered || normalized)) - ERR("in case of 'gpu' the Fortran frequency must always be used; missings/centering/normalizing cannot treated."); - tuning->floatLoop = floatLoop; - tuning->meanVsubstract = (bool) meanVsubstract; - tuning->meanSxIsubstract = (bool) meanSxIsubstract; - tuning->missingsFully0 = (bool) missingsFully0; - genetics->centered = (centering_type) centered; - genetics->prefer_external_freq = prefer_external_freq; - genetics->normalized = (normalizing_type) normalized; - tuning->variant = variant < 32 ? 32 : variant > 256 ? 256 : variant; - utils->basic.efficient = false; - utils->basic.Cprintlevel = print_details * 5; - tuning->addtransposed = true; - if (print_details > 10) { - print_options(global, utils); - PRINTF("setoptions5 done \n"); - } -} - - -#define TBM 0x03 -void plink2Geno5codestrans32(Uchar *M, Long snps, Long individuals, - int VARIABLE_IS_NOT_USED cores, - unit_t *Ans, Long ldAns) { - // printf("p25T enter\n"); - if (sizeof(Uchar) * BitsPerByte != MY_VARIANT) BUG; - Uchar *table = PLINK2FIVE; - - Long - ldM = 1L + (individuals - 1L) / CpByteplink, // in bytes!!! - end_snpsM1_CpB = (snps - 1L) / CpByte; - -#ifdef DO_PARALLEL -#pragma omp parallel for num_threads(cores) schedule(static) -#endif - for (Long i=0; i<=end_snpsM1_CpB; i++) { - Long rest = CpByte; - Uchar *pM = M + ldM * CpByte * i; - Uchar *a0 = ((Uchar*) (Ans + ldAns * i )); // !! - Uchar h[BitsPerByte]; // CpByte <= BitsPerByte ! - - if (i < end_snpsM1_CpB) {} else { - rest = snps - end_snpsM1_CpB * rest; - assert(rest > 0); - for (Long ll=rest; ll>= BitsPerCodeplink; - } - } - } - // printf("p25T done\n"); -} - - -#if defined LONG_HANGING_TYPE -#undef LONG_HANGING_TYPE -#endif -//#define LONG_HANGING_TYPE Long -//#define HTT_BITS 64 -#define LONG_HANGING_TYPE Long -#define HTT_BITS 64 -#define ORIGBITS_TRANS_MASK 0x00000000000003FF -// irgendwo fehler drin -- sollte eigentlich bisschen schneller gehen -void plink2Geno5codes32(Uchar *M, Long snps, Long individuals, - int VARIABLE_IS_NOT_USED cores, - unit_t *Ans, Long ldAns) { - // printf("p25 enter\n"); - Uchar *table = PLINK2FIVE; - Long ldM = 1L + (individuals - 1L) / CpByteplink; // in bytes!!! - Long end_indiM1_CpB = (individuals -1L) / CpByte; - Long ldaByte = ldAns * BytesPerUnit; - Long totalBytes = ldM * snps; - - if (MY_LDABITALIGN_2BIT < sizeof(Long)) BUG; - if (HTT_BITS != sizeof(LONG_HANGING_TYPE) * BitsPerByte) BUG; - -#ifdef DO_PARALLEL -#pragma omp parallel for num_threads(cores) schedule(static) -#endif - for (Long i=0; i>= movedBits; - } - *a = table[m & ORIGBITS_TRANS_MASK]; - m >>= ORIGBITSperFIVE; - availableBits -= ORIGBITSperFIVE; - } - } - // printf("p25 done\n"); -} - - -void plinkTo5(char *plink, char *plink_transposed, - Long snps, Long indiv, - double *f, - int max_n, - void**compressed) { - // printf("pto 5 do\n"); - option_type *global; - utilsoption_type *utils; - get_started(&global, &utils); - stopIfNotInt(snps | indiv); - if (global->tuning.gpu) { - plink2gpu((char*) plink, (char*) plink_transposed, - (int) snps, (int) indiv, f, max_n, - compressed); - return; - } - - basic_options *opt = &(utils->basic); - SEXP SxI = PROTECT(CreateEmptyCodeVector(snps, indiv, FiveCodes, - global->tuning.variant, 0, false, - global, utils)); - - *compressed = (void*) SxI; - int cores = GreaterZero(opt->cores); - unit_t *code = Align(SxI, opt); - Long *info = GetInfo(SxI); - Long ldAns = info[LDA]; - - if (global->tuning.missingsFully0) getPlinkMissings((Uchar*) plink, SxI, opt); - // printf("coding ...\n"); - plink2Geno5codes32((Uchar*) plink, snps, indiv, cores, code, ldAns); - - SEXP next = getAttribPointer(SxI, Next); - unit_t *codenext = Align(next, opt); - Long *infonext = GetInfo(next), - ldanext = infonext[LDA]; - - //printf("trans coding ...\n"); - plink2Geno5codestrans32((Uchar*)plink, snps, indiv, cores, codenext, ldanext); - if (global->genetics.prefer_external_freq) { - getFreq(SxI, f, global, utils); - } - // printf("plinkTo5 done\n"); -} - - -void vectorGeno5api(void *compressed, double *V, Long repetV, Long LdV, - double *ans, Long LdAns) { - option_type *global; - utilsoption_type *utils; - get_started(&global, &utils); - if (global->tuning.gpu) { - // - stopIfNotInt(repetV | LdV | LdAns); - dgemm_compressed_gpu(false, compressed, (int) repetV, V, (int) LdV, - global->genetics.centered, - global->genetics.normalized, - ans, (int) LdAns); - } else { - vectorGeno_means_double((SEXP)compressed, V, repetV, LdV, global, utils, - ans, LdAns); - } -} - - -void genoVector5api(void *compressed, double *V, Long repetV, Long LdV, - double *ans, Long LdAns) { - option_type *global; - utilsoption_type *utils; - get_started(&global, &utils); - if (global->tuning.gpu) { - stopIfNotInt(repetV | LdV | LdAns) - dgemm_compressed_gpu(true, compressed, (int) repetV, V, (int) LdV, - global->genetics.centered, - global->genetics.normalized, - ans, (int) LdAns); - } else { - genoVector_means_double((SEXP)compressed, V, repetV, LdV, global, utils, - ans, LdAns); - } -} - - - -void free5(void **compressed) { - option_type *global; - utilsoption_type *utils; - get_started(&global, &utils); - if (global->tuning.gpu) { - freegpu(compressed); - } else { - FREE_SEXP((SEXP*) compressed); // will crash?!!! - } -} - - -void getFreq5(void *compressed, double *f) { - SEXP SxI = (SEXP) compressed; - // printINFO(SxI); - option_type *global; - utilsoption_type *utils; - get_started(&global, &utils); - Long *info = GetInfo(SxI); - Long - snps = info[SNPS], - rows = snps; - // Long individuals = info[INDIVIDUALS]; - getFreq(SxI, global, utils); - LongDouble *freq = getFreq(SxI); - for (Long j=0; jcores); /* NOT Long !! */ \ @@ -59,7 +60,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS char msg[200]; \ SPRINTF(msg, "%s Bit %s-%s required for %ld x %ld x %ld", \ #NR, #TRDTYPE, #ENDTYPE, rows, cols, repetV); \ - PRINTF("%s", msg); for (int i=(int) strlen(msg); i<57; i++) PRINTF(" "); \ + PRINTF("%s", msg); for (int i=(int) STRLEN(msg); i<57; i++) PRINTF(" "); \ PRINTF(" -> %3ld Bit, %2d cores\n", \ sizeof(AVXTYPE) * BitsPerByte, cores); \ } \ @@ -68,9 +69,10 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS const Long colsCpB = DIV_GEQ(cols, CpB); \ const Long colsCpBm1 = colsCpB - 1; \ const Long colBlocks = DIV_GEQ(colsCpB, AtOnce); \ - Long blockSliceLen = DIV_GEQ(colBlocks, cores * 5); \ + Long blockSliceLen = DIV_GEQ(colBlocks, cores * coreFactor); \ + if (false) printf("blockSliceLen=%ld\n", blockSliceLen); \ blockSliceLen = MAX(1, MIN(SLICELEN, blockSliceLen)); \ - Long blocks = DIV_GEQ(colBlocks, blockSliceLen); /* (cores * 5) repetV */; \ + Long blocks = DIV_GEQ(colBlocks, blockSliceLen); /* (ca. cores * 5) repetV */; \ /* for large repetV, use repetV only for splitting */ \ /* blocks = blocks > cores ? cores : blocks > 1 ? blocks : 1; */ \ const Long blocksM1 = blocks - 1L; \ @@ -100,22 +102,22 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS -#if defined AVX2 +#if defined AVX2 || defined AVX512F #define gV5_start(NR, TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ gV5_header(NR, TYPE, SNDTYPE, TRDTYPE, ENDTYPE) { \ const Long VatOnce = (sizeof(AVXTYPE) / sizeof(TRDTYPE)); \ - if (VatOnce == 4) { \ - const Long restV = repetV % VatOnce; \ - if (restV > 0 && restV <= 2) { \ - vector_##TYPE##_t vD = restV == 1 ? genoVector5v32_##TYPE \ - : genoVector5v128_##TYPE; \ - vD(code, rows, cols, lda, coding, variant, pseudoFreq, \ - V, restV, ldV, opt, tuning, Ans, ldAns); \ - repetV -= restV; \ - V += restV * ldV; \ - Ans += restV * ldAns; \ - } \ - } \ + const Long restV = repetV % VatOnce; \ + if (restV > 0 && \ + ((VatOnce == 8 && restV <= 4) || (VatOnce == 4 && restV <= 2))) { \ + vector_##TYPE##_t vD = restV == 1 ? genoVector5v32_##TYPE \ + : restV == 2 ? genoVector5v128_##TYPE \ + : genoVector5v256_##TYPE; \ + vD(code, rows, cols, lda, coding, variant, pseudoFreq, \ + V, restV, ldV, opt, tuning, Ans, ldAns); \ + repetV -= restV; \ + V += restV * ldV; \ + Ans += restV * ldAns; \ + } \ gV5_start0(NR, TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) #else #define gV5_start(NR, TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ @@ -125,7 +127,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS #endif -#define gV5_LoopOne(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ +#define gV5_CreateHash(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ for (Long rr=0; rr < NVblocks; rr++) { \ AVXTYPE *f = F + rr * HashSizePerV; \ Long sEnd = MIN(VatOnce, repetV - rr * VatOnce); \ @@ -133,7 +135,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS TYPE *v_i = V + rr * VatOnce * ldV + i * CpB; \ AVXTYPE *hash0 = f + i * HashSize; \ int hastRest = CpB; \ - TYPE x[CpB] = {0, 0, 0, 0, 0};/* hier compiler fehler bei collapse!?! */ \ + SNDTYPE x[CpB] = {0, 0, 0, 0, 0};/* hier compiler fehler bei collapse!?! */ \ if (i < colsCpBm1) {} else { \ hastRest = (int) (cols - colsCpBm1 * hastRest); \ assert(hastRest > 0); \ @@ -143,32 +145,35 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS for (Long s=0; s 0); */ \ - for (Long i = 0 ; i 0; +*/ #if AtOnce == 4 #if defined AVX2 -#define gV5_Loop2B(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ +#define gV5_LoopB(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ const AVXTYPE fC0 = AVXLOAD((TRDTYPE*)(ff0 + (int) pC0[b])); \ const AVXTYPE fC1 = AVXLOAD((TRDTYPE*)(ff1 + (int) pC1[b])); \ const AVXTYPE fC2 = AVXLOAD((TRDTYPE*)(ff2 + (int) pC2[b])); \ @@ -224,7 +259,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS AVXSTORE((TRDTYPE*)(t + b), AVXADD( tb, h)); \ } #else -#define gV5_Loop2B(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ +#define gV5_LoopB(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ t[b] += (ff0[(int) pC0[b]] + ff1[(int) pC1[b]]) \ + (ff2[(int) pC2[b]] + ff3[(int) pC3[b]]); \ } @@ -232,7 +267,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS #elif AtOnce == 3 #if defined AVX2 -#define gV5_Loop2B(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ +#define gV5_LoopB(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ const AVXTYPE fC0 = AVXLOAD((TRDTYPE*)(ff0 + (int) pC0[b])); \ const AVXTYPE fC1 = AVXLOAD((TRDTYPE*)(ff1 + (int) pC1[b])); \ const AVXTYPE fC2 = AVXLOAD((TRDTYPE*)(ff2 + (int) pC2[b])); \ @@ -243,7 +278,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS AVXSTORE((TRDTYPE*)(t + b), tb); \ } #else -#define gV5_Loop2B(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ +#define gV5_LoopB(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ t[b] += ff0[(int) pC0[b]] + ff1[(int) pC1[b]] + ff2[(int) pC2[b]]; \ } #endif @@ -273,9 +308,9 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS #endif // AtOnce -#define gV5_LoopTwo(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ - gV5_Loop2A(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ - gV5_Loop2B(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ +#define gV5_MainLoop(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ + gV5_LoopA(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ + gV5_LoopB(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ }} \ }} \ if (false) PRINTF("calc end blocks=%ld\n", blocks); \ @@ -283,7 +318,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS #if AlteVersion == 0 -#define gV5_LoopSum(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ +#define gV5_SumUp(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ for (Long rr=0; rr < NVblocks; rr++) { \ AVXTYPE *tmp = Tmp + rr * blocksP1Xrows; \ Long level = rows; \ @@ -308,7 +343,7 @@ if (false) PRINTF("end sum up\n"); #else -#define gV5_LoopSum(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ +#define gV5_SumUp(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ for (Long rr=0; rr < NVblocks; rr++) { \ Long rowsVatonce = rows * VatOnce; \ TRDTYPE *s = (TRDTYPE*) (sum + rr * blocksP1Xrows); \ @@ -323,17 +358,17 @@ if (false) PRINTF("end sum up\n"); s[j] += (TRDTYPE) ((ENDTYPE) t0[j] + (ENDTYPE) t1[j] + \ (ENDTYPE) t2[j] + (ENDTYPE) t3[j]); \ } \ - } \ + } \ } \ if (false) PRINTF("!!! Old version of summing up (%ld, %ld)!!!\n", sizeof(TRDTYPE), sizeof(TRDTYPE)); - BUG; // no bug at all, just do not use this LoopSum + BUG; // no bug at all, just do not use this SumUp #endif -#define gV5_LoopEnd(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ +#define gV5_Sort(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE) \ for (Long rr=0; rr < NVblocks; rr++) { \ AVXTYPE *tmp = sum + rr * blocksP1Xrows; \ Long sEnd = MIN(VatOnce, repetV-rr * VatOnce); \ @@ -351,32 +386,33 @@ if (false) PRINTF("end sum up\n"); } // end function + + #if defined AVXLOAD #undef AVXLOAD #undef AVXSTORE #undef AVXADD #endif -#define AVXLOAD LOADuDOUBLE -#define AVXSTORE STOREuDOUBLE -#define AVXADD ADDDOUBLE -gV5_start(MY_VARIANT, double, LongDouble, double, Doubles, double) +#define AVXLOAD LOADuFLOAT +#define AVXSTORE STOREuFLOAT +#define AVXADD ADDFLOAT +gV5_start(MY_VARIANT, floatD, LongDouble, float, Floats, double) #ifdef DO_PARALLEL -#pragma omp parallel for num_threads(cores) schedule(static) +#pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopOne(double, LongDouble, double, Doubles, double) +gV5_CreateHash(floatD, LongDouble, float, Floats, double) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopTwo(double, LongDouble, double, Doubles, double) +gV5_MainLoop(floatD, LongDouble, float, Floats, double) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopSum(double, LongDouble, double, Doubles, double) +gV5_SumUp(floatD, LongDouble, float, Floats, double) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopEnd(double, LongDouble, double, Doubles, double) - +gV5_Sort(floatD, LongDouble, float, Floats, double) @@ -385,28 +421,27 @@ gV5_LoopEnd(double, LongDouble, double, Doubles, double) #undef AVXSTORE #undef AVXADD #endif -#define AVXLOAD LOADuFLOAT -#define AVXSTORE STOREuFLOAT -#define AVXADD ADDFLOAT -gV5_start(MY_VARIANT, floatD, LongDouble, float, Floats, double) +#define AVXLOAD LOADuDOUBLE +#define AVXSTORE STOREuDOUBLE +#define AVXADD ADDDOUBLE +gV5_start(MY_VARIANT, double, LongDouble, double, Doubles, double) #ifdef DO_PARALLEL -#pragma omp parallel for num_threads(cores) schedule(static) +#pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopOne(floatD, LongDouble, float, Floats, double) +gV5_CreateHash(double, LongDouble, double, Doubles, double) #ifdef DO_PARALLEL -#pragma omp parallel for num_threads(cores) schedule(static) +#pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopTwo(floatD, LongDouble, float, Floats, double) +gV5_MainLoop(double, LongDouble, double, Doubles, double) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopSum(floatD, LongDouble, float, Floats, double) +gV5_SumUp(double, LongDouble, double, Doubles, double) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopEnd(floatD, LongDouble, float, Floats, double) - - +gV5_Sort(double, LongDouble, double, Doubles, double) + #endif diff --git a/src/miraculix/5codesUint.cc b/src/miraculix/5codesUint.cc index e53739cb..ba77f3a7 100755 --- a/src/miraculix/5codesUint.cc +++ b/src/miraculix/5codesUint.cc @@ -232,19 +232,19 @@ gV5_start(32, longD, LongDouble, LongDouble, LongDouble, double) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopOne(longD, LongDouble, LongDouble, LongDouble, double) +gV5_CreateHash(longD, LongDouble, LongDouble, LongDouble, double) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopTwo(longD, LongDouble, LongDouble, LongDouble, double) +gV5_MainLoop(longD, LongDouble, LongDouble, LongDouble, double) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopSum(longD, LongDouble, LongDouble, LongDouble, double) +gV5_SumUp(longD, LongDouble, LongDouble, LongDouble, double) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopEnd(longD, LongDouble, LongDouble, LongDouble, double) +gV5_Sort(longD, LongDouble, LongDouble, LongDouble, double) @@ -252,19 +252,19 @@ gV5_start(32, LongDouble, LongDouble, LongDouble, LongDouble, LongDouble) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopOne(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble) +gV5_CreateHash(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopTwo(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble) +gV5_MainLoop(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopSum(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble) +gV5_SumUp(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopEnd(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble) +gV5_Sort(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble) @@ -272,19 +272,19 @@ gV5_start(32, Ulong, Ulong, Ulong, Ulong, Ulong) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopOne(Ulong, Ulong, Ulong, Ulong, Ulong) +gV5_CreateHash(Ulong, Ulong, Ulong, Ulong, Ulong) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopTwo(Ulong, Ulong, Ulong, Ulong, Ulong) +gV5_MainLoop(Ulong, Ulong, Ulong, Ulong, Ulong) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopSum(Ulong, Ulong, Ulong, Ulong, Ulong) +gV5_SumUp(Ulong, Ulong, Ulong, Ulong, Ulong) #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(static) #endif -gV5_LoopEnd(Ulong, Ulong, Ulong, Ulong, Ulong) +gV5_Sort(Ulong, Ulong, Ulong, Ulong, Ulong) diff --git a/src/miraculix/Files32.cc b/src/miraculix/Files32.cc index 9e930c62..995a1e42 100755 --- a/src/miraculix/Files32.cc +++ b/src/miraculix/Files32.cc @@ -125,11 +125,9 @@ SEXP file_binary(char *file, char *file_coding, int header, bool isSNPxInd, matrix_size = matrixLDA * colsLDA, idxIncr = isSNPxInd ? matrixLDA : 1; Long lda = info[LDA]; - if ((matrix = (unit_t*) CALLOC(matrix_size, BytesPerUnit)) == NULL) - ERR0("memory space could not be acquired"); - char *buffer0 = (char*) CALLOC(blocks * bytesperblock +// OK - MaxByteAlign, 1), - *buffer = (char*)((1L + (uintptr_t) buffer0 / MaxByteAlign)*MaxByteAlign); + matrix = (unit_t*) CALLOC(matrix_size, BytesPerUnit); + char *buffer0 = (char*) CALLOC(blocks * bytesperblock + MaxByteAlign, 1);// OK + char *buffer =(char*)((1L + (uintptr_t) buffer0 / MaxByteAlign)*MaxByteAlign); if ((fp = fopen(file, "r")) == NULL) ERR1("file '%.50s' could not be opened", file); diff --git a/src/miraculix/FilesUint.cc b/src/miraculix/FilesUint.cc index a7d30f3a..f5d7f255 100755 --- a/src/miraculix/FilesUint.cc +++ b/src/miraculix/FilesUint.cc @@ -366,8 +366,7 @@ SEXP file_intern(SEXP file, coding_type coding, int variant, // printf("\n########### matrixLDA %d %d %d\n", matrixLDA, DeltaMatrix, isHaplo(coding)); - if ((matrix = (unit_t*) CALLOC(matrix_size, BytesPerUnit)) == NULL) - ERR0("memory space could not be acquired"); + matrix = (unit_t*) CALLOC(matrix_size, BytesPerUnit); // jump header lines fp = fopen(filename, "r"); diff --git a/src/miraculix/MXinfo.h b/src/miraculix/MXinfo.h index 08811ac7..cd24297a 100755 --- a/src/miraculix/MXinfo.h +++ b/src/miraculix/MXinfo.h @@ -60,6 +60,7 @@ static inline uintptr_t algn_generalL(int *X, Long LDAbitalign) { assert(ROUND_GEQ((uintptr_t) X, f) >= (uintptr_t) X); return ROUND_GEQ((uintptr_t) X, f); } + static inline void *algn_generalS(SEXP Code, Long LDAbitalign) { assert(getAttribPointer(Code, Information) == R_NilValue); uintptr_t X = (uintptr_t) INTEGER(Code); diff --git a/src/miraculix/MoBPS_R.cc b/src/miraculix/MoBPS_R.cc index faa36823..f5456c77 100755 --- a/src/miraculix/MoBPS_R.cc +++ b/src/miraculix/MoBPS_R.cc @@ -650,8 +650,6 @@ SEXP compute(SEXP popul, SEXP Generation, SEXP Sex, SEXP Nr, memInUnits = (Long) individuals * ldaHPL, mem = calculateAlignedMem(memInUnits, TwoBitHaplo); int *G = (int *) CALLOC(mem, BytesPerUnit); - if (G == NULL) ERR1("mem allocation (%u bytes)", - (Uint) (mem * BytesPerUnit)); unit_t *g = (unit_t*) algn_generalL(G, MY_LDABITALIGN); if ((Uint) LENGTH(Vec) != individuals) ERR0("'vec' not of correct length"); diff --git a/src/miraculix/Template.h b/src/miraculix/Template.h index 2a6354f2..989c1fe3 100755 --- a/src/miraculix/Template.h +++ b/src/miraculix/Template.h @@ -50,14 +50,17 @@ typedef void (*coding_t)(unit_t *, Long, Long, Long, Long, Long, #define vector_header(TYPE, ENDTYPE) \ /* !!! not snps / indivduals for FiveCodesTransposed !!! */ \ - (unit_t *code, Long rows, Long cols, Long lda, \ + (unit_t VARIABLE_IS_NOT_USED *code, \ + Long VARIABLE_IS_NOT_USED rows, Long VARIABLE_IS_NOT_USED cols, \ + Long VARIABLE_IS_NOT_USED lda, \ coding_type VARIABLE_IS_NOT_USED coding, \ int VARIABLE_IS_NOT_USED variant, \ LongDouble VARIABLE_IS_NOT_USED * pseudoFreq, \ - TYPE *V, Long repetV, Long ldV, \ + TYPE VARIABLE_IS_NOT_USED *V, Long VARIABLE_IS_NOT_USED repetV, \ + Long VARIABLE_IS_NOT_USED ldV, \ basic_options VARIABLE_IS_NOT_USED *opt, \ tuning_options VARIABLE_IS_NOT_USED *tuning, \ - ENDTYPE *Ans, Long ldAns) + ENDTYPE VARIABLE_IS_NOT_USED *Ans, Long VARIABLE_IS_NOT_USED ldAns) typedef void (*vector_double_t) vector_header(double, double); @@ -121,11 +124,12 @@ typedef void (*gV_vG_Ulong_t) gV_vG_header0(Ulong); #define CodesPerLong ((int) (sizeof(Long) * BitsPerByte / BitsPerCode)) #define sparseTGeno_start(NAME) \ sparseTGeno_header(NAME){ \ - if (nrow_code > (Long)MAXINT || nrow_code < (Long)CodesPerLong || \ - ncol_code > (Long)MAXINT) BUG; \ + if (nrow_code > (Long)MAXINT || ncol_code > (Long)MAXINT) BUG; \ const basic_options *opt = &(utils->basic); \ const int cores = GreaterZero(opt->cores); +// || nrow_code < (Long)CodesPerLong + typedef void (*sparseTGeno_t) sparseTGeno_Header; diff --git a/src/miraculix/Vector.matrix.Uint.cc b/src/miraculix/Vector.matrix.Uint.cc index d05d923a..c46d0332 100755 --- a/src/miraculix/Vector.matrix.Uint.cc +++ b/src/miraculix/Vector.matrix.Uint.cc @@ -281,7 +281,7 @@ gV_raw(LongDouble) void VectorRelMatrix( SEXP SxI, SEXP SxI2, double *V, Long repetV, int compare, option_type *global, utilsoption_type *utils, double *ans) { - const int debug = false; + const int debug = !false; // if (false) { printINFO(SxI); BUG; } diff --git a/src/miraculix/compatibility.C.h b/src/miraculix/compatibility.C.h index f1bbc4bd..5471c959 100755 --- a/src/miraculix/compatibility.C.h +++ b/src/miraculix/compatibility.C.h @@ -67,7 +67,7 @@ extern int errno ; #define EXP exp #define FABS(X) fabs((double) X) // OK; keine Klammern um X! #if ! defined MALLOCX -#define MALLOCX malloc +#define MALLOCX(X) notNull(malloc(X), __LINE__, __FILE__) #define FLOOR floor #define SQRT(X) sqrt((double) X) // OK #define CEIL(X) ceil((double) X) // OK; keine Klammern um X! @@ -85,8 +85,8 @@ extern int errno ; #define MEMMOVE memmove #define MEMSET memset #define MEMCMP memcmp -#define AALLOC aligned_alloc -#define CALLOCX calloc +#define AALLOC(X) notNull(aligned_alloc(X), __LINE__, __FILE__) +#define CALLOCX(X,N) notNull(calloc(X,N), __LINE__, __FILE__) #define SPRINTF sprintf // Rprintf #define QSORT qsort #define RFERROR(X) {fprintf(stderr, "%s", X); exit(EXIT_FAILURE); } diff --git a/src/miraculix/compatibility.R.h b/src/miraculix/compatibility.R.h index 90c11ddd..239e0e4b 100755 --- a/src/miraculix/compatibility.R.h +++ b/src/miraculix/compatibility.R.h @@ -44,7 +44,7 @@ #define EXP std::exp #define FABS(X) std::fabs((double) X) // OK; keine Klammern um X! #if ! defined MALLOCX -#define MALLOCX std::malloc +#define MALLOCX(X) notNull(std::malloc, __LINE__, __FILE__) #define FLOOR std::floor #define SQRT(X) std::sqrt((double) X) // OK #define CEIL(X) std::ceil((double) X) // OK; keine Klammern um X! @@ -62,8 +62,8 @@ #define MEMMOVE std::memmove #define MEMSET std::memset #define MEMCMP std::memcmp -#define AALLOC std::aligned_alloc -#define CALLOCX std::calloc +#define AALLOC notNull(std::aligned_alloc, __LINE__, __FILE__) +#define CALLOCX notNull(std::calloc, __LINE__, __FILE__) #define SPRINTF std::sprintf // Rprint #define QSORT std::qsort #define RFERROR error diff --git a/src/miraculix/compatibility.c.cc b/src/miraculix/compatibility.c.cc index c45b1b08..7bf89086 100755 --- a/src/miraculix/compatibility.c.cc +++ b/src/miraculix/compatibility.c.cc @@ -78,3 +78,10 @@ void stopIfNotAnyIntI(Long i, Long line, const char *file) { ERR3("value (%ld) not an integer at line %ld in %s\n", i, line, file); } + + +void *notNull(void *X, int line, const char *file) { + if (X!=NULL) return X; + ERR2("No memory was allocated at line %d of %s.\n", line, file); + return NULL; +} diff --git a/src/miraculix/compatibility.general.h b/src/miraculix/compatibility.general.h index e0712398..59cf7d35 100755 --- a/src/miraculix/compatibility.general.h +++ b/src/miraculix/compatibility.general.h @@ -80,5 +80,6 @@ typedef long double LongDouble; #define ASSERT_SOLVE(sp) if (sp == NULL) sp = &(OPTIONS.solve); else {} #endif - +void *notNull(void *X, int line, const char *file); + #endif diff --git a/src/miraculix/compatibility.sexp.cc b/src/miraculix/compatibility.sexp.cc index 845135db..366ce688 100755 --- a/src/miraculix/compatibility.sexp.cc +++ b/src/miraculix/compatibility.sexp.cc @@ -135,7 +135,7 @@ SEXP allocObject(int type, Long len, void *x) { Ans->x = CALLOC(1, sizeof(void*)); *((void**) (Ans->x)) = x; } else Ans->x = len == 0 ? R_NilValue : CALLOC(Memory(type), len); - // printf("don %ld\n", (Long) (Ans->x)); + return Ans; } diff --git a/src/miraculix/haplogeno.cc b/src/miraculix/haplogeno.cc index 07c1c44d..b92e2124 100755 --- a/src/miraculix/haplogeno.cc +++ b/src/miraculix/haplogeno.cc @@ -1068,8 +1068,8 @@ void do_centering(Long Snps, Long Individuals, centering_type centered, } if (transposed) centered = centered == RowMeans ? ColMeans : centered == ColMeans ? RowMeans : centered; - double *sums = (double *) MALLOC(sizeof(double) * cols), - factor = 1.0; + double factor = 1.0, + *sums = (double *) MALLOC(sizeof(double) * cols); switch(centered) { case RowMeans: @@ -1216,7 +1216,6 @@ double *crossprod(unit_t * Code, Long snps, Long individuals, Long lda, basic_options *opt, tuning_options *tuning) { // never squared here!!! double *A = (double *) MALLOC(individuals * individuals * sizeof(double)); - if (A == NULL) ERR0("mem allocation"); crossprodI(Code, snps, individuals, lda, coding, variant, centered, normalized, squared, Precision, @@ -1373,7 +1372,6 @@ void allele_sum_I(SEXP SxI, FREE(E); } - assert(sum != NULL); if (EightByte) { for (Long i=0; iefficient = false; + opt->efficient = variant >= 512; CLOCK("starting"); Long size = indiv * repetV; @@ -559,65 +573,65 @@ int main(int argc, char *argv[]) { CLOCK("transforming to OrigPlink"); UNPROTECT(1); } - assert(code != NULL); int p_nrow = sparse_nrow < 10 ? sparse_nrow : 10; int p_ncol = snps < 20 ? (int) snps : 20; int p_indiv = indiv < 10 ? (int) indiv : 10; - for (int k=0; k<30; k++) +#define repetSparse 10 + for (int k=0; k +#include "parallel_simd.h" +#ifdef TIME_AVAILABLE +#include +#endif + + + + +#include "Basic_RFUlocal.h" // must be before anything else +#include "kleinkram.h" +#include "zzz_RFU.h" +#include "options_RFU.h" +#include "extern_RFU.h" +#if defined compatibility_to_R_h +#include "xport_import_RFU.h" +#endif + +// IMPORTANT: all names of general must have at least 3 letters !!! +const char *basic[basicN] = + { "printlevel","cPrintlevel", "seed", "cores", + "skipchecks", "asList", "verbose", "helpinfo", "efficient", + "bigendian","warn_parallel", "exactness" + }; + +const char *installNrun[installNrunN] = + { "kahanCorrection", "warn_unknown_option", "la_mode", + "install","installPackages", "determineLAmode", "mem_is_aligned", + "gpuDevices", "maxStreams" + }; + +const char * solve[solveN] = + { "use_spam", "spam_tol", "spam_min_p", "svdtol", "eigen2zero", + "solve_method", "spam_min_n", "spam_sample_n", "spam_factor", "spam_pivot", + "max_chol", "max_svd", "pivot", + "pivot_idx", // dynamic parameter + "pivot_relerror", "pivot_max_deviation", "pivot_max_reldeviation", + "det_as_log", "pivot_actual_size", "pivot_check", "pseudoinverse", + "AtAmode", "AtAnrow", "AtAncol", "Strassen_min", "Strassen_factor" + //, "tmp_delete" + }; + + +const char * prefixlist[prefixN] = {"basic", "installNrun", "solve"}; +const char **allOptions[prefixN] = {basic, installNrun, solve}; +int allOptionsN[prefixN] = {basicN, installNrunN, solveN}; + + +utilsoption_type OPTIONS = { // OK + basic_START, + installNrun_START, + { solve_START }, + dummy_START +}; + + + + +int doPosDefIntern(double *M0, int size, bool posdef, + double *rhs0, Long rhs_cols, double *result, + double *logdet, int calculate, solve_storage *Pt, + solve_options *sp, int VARIABLE_IS_NOT_USED cores); + +void params_utilsoption(int local, int *params) { + utilsoption_type *from = WhichOptionList(local); + params[PIVOT_IDX_N] = from->solve.n_pivot_idx; +} + +void get_utils_basic(basic_options *opt, int local) { + utilsoption_type *from = WhichOptionList(local); + MEMCOPY(opt, from, sizeof(basic_options)); +} + +void get_utilsoption(utilsoption_type *S, int local) { + assert(solveN == 21 && basicN == 9 && installNrunN == 10 && prefixN==3); + utilsoption_type *from = WhichOptionList(local); + assert(from->solve.n_pivot_idx!=0 xor from->solve.pivot_idx == NULL); + assert(S->solve.n_pivot_idx!=0 xor S->solve.pivot_idx == NULL); + if (S->solve.n_pivot_idx != from->solve.n_pivot_idx) BUG; + int *save_idx = S->solve.pivot_idx, + n = S->solve.n_pivot_idx < from->solve.n_pivot_idx ? + S->solve.n_pivot_idx : from->solve.n_pivot_idx; + MEMCOPY(S, from, sizeof(utilsoption_type)); // OK + S->solve.pivot_idx = save_idx; + if (n > 0) { + MEMCOPY(S->solve.pivot_idx, from->solve.pivot_idx, + sizeof(*(S->solve.pivot_idx)) * n); + } +} + +void push_utilsoption(utilsoption_type *S, int local) { + utilsoption_type *to = WhichOptionList(local); + assert(to->solve.n_pivot_idx!=0 xor to->solve.pivot_idx == NULL); + assert(S->solve.n_pivot_idx!=0 xor S->solve.pivot_idx == NULL); + int *save_idx = to->solve.pivot_idx; + if (to->solve.n_pivot_idx != S->solve.n_pivot_idx) { + FREE(to->solve.pivot_idx); + to->solve.pivot_idx = (int*) MALLOC(S->solve.n_pivot_idx * sizeof(int)); + save_idx = to->solve.pivot_idx; + } + MEMCOPY(to, S, sizeof(utilsoption_type)); // OK + to->solve.pivot_idx = save_idx; + if (S->solve.n_pivot_idx > 0) { + MEMCOPY(to->solve.pivot_idx, S->solve.pivot_idx, + sizeof(*(to->solve.pivot_idx)) * S->solve.n_pivot_idx); + } +} + +void del_utilsoption(utilsoption_type *S) { + FREE(S->solve.pivot_idx); + S->solve.n_pivot_idx = 0; +} + +void update_utilsoption() { + push_utilsoption(&OPTIONS, true); +} + +#define FASTER 1.3 // 1.3 as for typical application of likelihood, +// the determinant calculation in RandomFieldsUtils is for free. Somehow a +// balance +int own_chol_up_to(int size, int maxtime, int VARIABLE_IS_NOT_USED cores) { +#ifdef TIME_AVAILABLE + if (size <= 0) return true; + Long delta[2]; + solve_options sp; + solve_storage pt; + solve_NULL(&pt); + MEMCOPY(&sp, &(OPTIONS.solve), sizeof(solve_options)); + sp.Methods[0] = Cholesky; + sp.Methods[1] = NoFurtherInversionMethod; + sp.pivot_mode = PIVOT_NONE; + sp.sparse = False; + double old_quotient = RF_NAN; + // basic assumption is that R implementation's getting faster + // for larger matrices + // printf("**** start\n"); + while (true) { + // printf("x \n"); + int sizeP1 = size + 1, + sizesq = size * size, + loops = size > 64 ? 1 : 16384 / ((size + 8) * (size+8)) / 4; + double *M = (double*) MALLOC(sizesq * sizeof(double)); + for (int j=0; j<=1; j++) { + // printf("%d,%d\n", j, loops); + SetLaMode(j == 0 ? LA_INTERN : LA_R, cores); + clock_t start = clock(); + for (int k=0; k 1) M[1] = M[size] = 1e-5; + //printf("size=%d\n", size); + doPosDefIntern(M, size, true, NULL, 0, NULL, NULL, MATRIXSQRT, &pt, &sp, + cores); + //printf("doen\n"); + } + delta[j] = (Long) clock() - start; + if (delta[j] < 0) delta[j] += MAXINT; // manual: 32bit repres. of clock + } + FREE(M); + if (PL > 2) + PRINTF("Cholesky decomposition for a %d x %d matrix needs %ld and %ld [mu s] on R and facile code on %d cores (#%d), respectively.\n", size, size, delta[1], delta[0], cores, loops); + + // printf("delta %d %d %d\n", delta[0], delta[1], maxtime); + + if (delta[0] > maxtime || delta[1] > maxtime || + (double) delta[0] >= FASTER * (double) delta[1]){ + solve_DELETE0(&pt); + if ((maxtime > 0 && + (delta[0] > 10 * maxtime || delta[1] > 10 * maxtime)) || + delta[0] > 2 * delta[1] || delta[1] > 2 * delta[0]) { + // seems to be time consuming. So stop. + return (double) delta[0] < FASTER * (double) delta[1] + ? MAXINT : (size <= 0 ? 0 : size / 2); + } + break; + } + old_quotient = (double) delta[0] / (double) delta[1]; + size *= 2; + } + double new_quotient = (double) delta[0] / (double) delta[1]; + if (new_quotient < FASTER) return MAXINT; + if (size <= 0) return(0); + if (!R_FINITE(old_quotient)) { + // printf("halfsize\n"); + int compare = own_chol_up_to(size / 2, 0, cores); + return compare == MAXINT ? size : compare; + } + double x0 = 0.5 * size * (1.0 + (FASTER - old_quotient) / + (new_quotient - old_quotient)); //lin interpol + assert(x0 >= 0.5 * size && x0 <= size); + int compare = own_chol_up_to((int) x0, 0, cores); + // printf("%f %f %f %f %d %d\n", x0,FASTER, old_quotient, new_quotient, size, compare); + return (int) (compare == MAXINT ? x0 : 0.5 * size); +#else + ERR0("option 'LA_AUTO' is available only on linux systems"); + return 0; +#endif +} +int own_chol_up_to(int VARIABLE_IS_NOT_USED cores) { + own_chol_up_to(256, 0, cores); //warm up for some BLAS implementatioan + // own_chol_up_to(256, 50000); + // own_chol_up_to(8, 50000); + return own_chol_up_to(256, 50000, cores); +} + + + +void SetLaMode(la_modes usr_mode, int VARIABLE_IS_NOT_USED cores) { + utilsoption_type *utils = &OPTIONS; + la_modes la_mode = usr_mode; + utils->solve.tinysize = + utils->installNrun.LaMaxTakeIntern = -1; +#define TINY_SIZE_MAX 3 + if (la_mode == LA_INTERN) { + utils->solve.tinysize = TINY_SIZE_MAX; + utils->installNrun.LaMaxTakeIntern = MAXINT; + } else if (la_mode == LA_AUTO) { + la_mode = HAS_GPU ? LA_GPU : LA_R ; +#if defined TIME_AVAILABLE +# ifdef SCHLATHERS_MACHINE +#else + int PLalt = PL; + PL = 0; +# endif + utils->installNrun.LaMaxTakeIntern = own_chol_up_to(cores); + utils->solve.tinysize = MIN(TINY_SIZE_MAX, utils->installNrun.LaMaxTakeIntern); + if (PL > 0) + PRINTF("Limit size for facile Cholesky algorithm = %d\n", + utils->installNrun.LaMaxTakeIntern); +# ifdef SCHLATHERS_MACHINE +#else + PL = PLalt; +# endif +#endif + } + + if ((la_mode == LA_GPU || la_mode == LA_R) && + utils->solve.pivot_mode > PIVOT_AUTO) + ERR0("Pivotized Cholesky decomposition has not been implemented yet for GPU and the LAPACK library"); + + utils->installNrun.la_mode = la_mode; +} +void SetLaMode() { + utilsoption_type *utils = &OPTIONS; + SetLaMode(utils->installNrun.la_usr, + GreaterZero(utils->basic.cores)); +} + + + + + +utilsoption_type *WhichOptionList(bool local) { + if (local) { + #if defined compatibility_to_R_h + KEY_type *KT = KEYT(); + if (KT == NULL) BUG; + return &(KT->global_utils); +#else + ERR0("'local' must be 'false' in standalone packages.") +#endif + } + return &OPTIONS; +} diff --git a/src/miraculix/parallel_simd.h b/src/miraculix/parallel_simd.h index 771cae6e..70c05090 100755 --- a/src/miraculix/parallel_simd.h +++ b/src/miraculix/parallel_simd.h @@ -18,12 +18,11 @@ */ - #ifndef parallel_omp__H #define parallel_omp__H 1 // #define NEVER_OMP 1 -#define NEVER_AVX512 1 +#define DO_AVX512F 1 // #define NEVER_AVX 1 // #define NEVER_SSE 1 diff --git a/src/miraculix/plink.h b/src/miraculix/plink.h index 980e736f..58503779 100755 --- a/src/miraculix/plink.h +++ b/src/miraculix/plink.h @@ -36,7 +36,7 @@ vectorGeno_header(double, double, double, Plink); vectorGeno_header(LongDouble, LongDouble, LongDouble, Plink); sumGeno_header(Plink); -sparseTGeno_header(OrigPlink); +sparseTGeno_header(Plink); vectorGeno_header(double, double, double, PlinkMatrix256); void getPlinkMissings(Uchar* plink, SEXP SxI, basic_options *opt); diff --git a/src/miraculix/plink256.cc b/src/miraculix/plink256.cc index ecc75c54..ed54ee31 100755 --- a/src/miraculix/plink256.cc +++ b/src/miraculix/plink256.cc @@ -62,7 +62,7 @@ IMPORTANT FUNCTIONS IN 2bit256.cc vectorGeno_header(double, double, double, PlinkMatrix256) { - // calculates v * t(geno) for some geno matrix given by "code" + // calculates t(v) * geno for some geno matrix given by "code" // In : V : snps x repetV // code : snps x indiv // Out: Ans : indiv x repetV @@ -166,8 +166,6 @@ vectorGeno_header(double, double, double, PlinkMatrix256) { // printf("bytesTV = %ld %ld\n", bytesTV, bytesTans); Doubles *TV = (Doubles*) MALLOC(bytesTV); Doubles *Tans = (Doubles*) MALLOC(bytesTans); - assert(TV != NULL); - assert(Tans != NULL); #define miniIndiv1 8 diff --git a/src/miraculix/plinkUint.cc b/src/miraculix/plinkUint.cc index 93ed51ee..4e915d27 100755 --- a/src/miraculix/plinkUint.cc +++ b/src/miraculix/plinkUint.cc @@ -313,7 +313,7 @@ void getPlinkMissings(Uchar* plink, SEXP SxI, basic_options *opt) { if (snp##N + f + 1 <= snpsEnd) snps##N = snp##N[f]; \ else { \ Uchar *S = (Uchar*) (snp##N + f); \ - Uchar TMP[TMP_LEN] = { \ + Uchar TMP[TMP_LEN] = { /* LittleEndian! */ \ S + 0 < (Uchar*) snpsEnd ? S[0] : (Uchar) 0, \ S + 1 < (Uchar*) snpsEnd ? S[1] : (Uchar) 0, \ S + 2 < (Uchar*) snpsEnd ? S[2] : (Uchar) 0, \ @@ -349,7 +349,8 @@ void getPlinkMissings(Uchar* plink, SEXP SxI, basic_options *opt) { iChunk+=miniIndivChunk; \ nonzeros-=miniIndivChunk -sparseTGeno_start(OrigPlink) +sparseTGeno_start(Plink) + // sparse * t(code) // NOTE: for better readability only, redefine nrow & ncol by their // typical meaning in (sparse) %*% t(code), namely snps and individuals const Long snps = nrow_code; @@ -367,7 +368,7 @@ sparseTGeno_start(OrigPlink) double *ans = Ans + iRchunk * ldAns; assert(sizeof(Ulong) == TMP_LEN); #ifdef DO_PARALLEL -#pragma omp parallel for num_threads(cores) schedule(static,staticSize) ordered +#pragma omp parallel for num_threads(cores) // schedule(static) // schedule(static,staticSize) ordered #endif for (int j=0; jerr_msg +#endif +#include "kleinkram.h" +#include "zzz_RFU.h" + +//#include "xport_import_RFU.h" +extern const char * solve[solveN]; +#define SCALAR(A,B,C) scalarX(A,B,C,NR) +#define LINEAR(A,B,C,D) linearX(A,B,C,D, linear_avx) + + +AVAILABLE_SIMD + + + +const char * InversionNames[nr_InversionMethods] = { + "cholesky", "svd", "eigen", "sparse", + "method undefined", + "qr", "lu", + "no method left", + "GPU-cholesky", + "R chol implementation", + "direct formula", + "diagonal"}; + + +#define KAHAN false // O P TIONS.installNrun.kahanCorrection + +#define C_MALLOC(INT, WHICH, N, TYPE) { \ + INT _N_ = N; \ + if (pt->n_##WHICH < _N_) { \ + if (pt->n_##WHICH < 0) BUG; \ + FREE(pt->WHICH); \ + pt->n_##WHICH = _N_; \ + if ((pt->WHICH = (TYPE *) CALLOC(_N_, sizeof(TYPE))) == NULL) \ + return ERRORMEMORYALLOCATION; \ + } else { \ + assert( (_N_ > 0 && pt->WHICH != NULL) || _N_ == 0); \ + for (INT iii=0; iii<_N_; pt->WHICH[iii++] = 0); \ + } \ + } \ + TYPE VARIABLE_IS_NOT_USED *WHICH = pt->WHICH + +#define CMALLOC(WHICH, N, TYPE) C_MALLOC(int, WHICH, N, TYPE) + + +/* +// sqrtPosDef nutzt pt->U fuer das Ergebnis +#define FREEING_RESULT(WHICH) \ + assert(int VARIABLE_IS_UNUSED *_i = WHICH); \ + if (pt->WHICH != NULL && pt->WHICH != result) { \ + UNCONDFREE(pt->WHICH); \ + pt->n_##WHICH = 0; \ + } +*/ + + +double Determinant(double *M, int size, bool logarithm) { + Long sizeP1 = size + 1, + sizeSq = (Long) size * size; + if (logarithm) { + double tmp = 0.0; + for (Long i=0; ito_be_deleted); +} + +void solve_DELETE(solve_storage **S) { + solve_storage *x = *S; + if (x!=NULL) { + solve_DELETE0(*S); + UNCONDFREE(*S); + } +} +void solve_NULL(solve_storage* x) { + if (x == NULL) return; + MEMSET(x, 0, sizeof(solve_storage)); + x->nsuper = x->size = -1; + x->method = NoInversionMethod; + for (int i=0; inewMethods[i++] = NoInversionMethod); + x->actual_pivot = PIVOT_UNDEFINED; +} + +double static inline det3(double *M, int size) { + double det; + switch(size){ // Abfrage nach Groesse der Matrix M + Berechnung der Determinante per Hand + case 1: det = M[0]; + break; + case 2: det = M[0] * M[3] - M[1] * M[2]; + break; + case 3: det = + M[0] * (M[4] * M[8] - M[5] * M[7]) + - M[1] * (M[3] * M[8] - M[5] * M[6]) + + M[2] * (M[3] * M[7] - M[4] * M[6]); // Entwicklung nach 1. Spalte + break; + default : BUG; + break; + } + return det; +} + +int logdet3(double det, bool posdef, double *logdet, bool logarithm) { + if (posdef && det < 0) return ERRORFAILED; + if (logdet != NULL) { + if (logarithm) { + if (det <= 0) return ERRORFAILED; + *logdet = LOG(det); + } else *logdet = det; + } + return NOERROR; +} + +int solve3(double *M, int size, bool posdef, + double *rhs, int rhs_cols, + double *result, double *logdet, bool logarithm, + solve_storage *pt + ){ + + assert(size <= 3); + if (size <= 0) SERR0("matrix in 'solvePosDef' of non-positive size."); + + double det = det3(M, size); + if (logdet3(det, posdef, logdet, logarithm) != NOERROR) return ERRORFAILED; + + double detinv = 1.0 / det; // determinant of inverse of M + + switch(size){ + case 1 : {// size of matrix == 1 + if (rhs_cols == 0) result[0] = detinv; + else for (int i=0; i 0.0 ? M[size] / res[0] : 0.0; + double dummy = M[size + 1] - res[size] * res[size]; + res[size + 1] = SQRT(MAX(0.0, dummy)); + if (size == 2) return NOERROR; + res[2] = res[5] = 0.0; + res[6] = res[0] > 0.0 ? M[6] / res[0] : 0.0; + res[7] = res[4] > 0.0 ? (M[7] - res[3] * res[6]) / res[4] : 0.0; + dummy = M[8] - res[6] * res[6] - res[7] * res[7]; + res[8] = SQRT(MAX(0.0, dummy)); + return NOERROR; +} + + +void Sort(double *RESULT, int size, Long rhs_cols, int *pi, int *rank, + double *dummy) { + if (size > MAXINT) BUG; + orderingInt(pi, (int) size, 1, rank); + int i=0; + Long totalRHS = size * rhs_cols; + while(i < size && i == rank[i]) i++; + while (i < size) { + int stored_i = i, + read = i; + double *p_write = NULL, + *p_read = RESULT + read; + for (Long k=0; k 60) schedule(dynamic, 20) +#endif + for (Long k=0; k 60) schedule(dynamic, 20) +#endif + for (Long k=0; kk; i--) { + double *pM = MPT + i * size, + r = (p_RESULT[i] /= pM[i]); + diagonal[k] -= r *pM[k]; + LINEAR(pM + k + 1, -r, i-k-1, p_RESULT + k + 1); + // for (int j=k+1; j MAXINT) BUG; + assert(NA_LOGICAL == INT_MIN && NA_LOGICAL == NA_INTEGER); // nur zur sicherheit, wegen usr_bool + // eigentlich sollte usr_bool unabhaengig davon funktionieren + assert(calculate != DETERMINANT || + (logdet != NULL && result == NULL && rhs0 == NULL)); + assert(calculate != MATRIXSQRT || (rhs0 == NULL && posdef)); + assert((rhs_cols != 0) xor (rhs0 == NULL)); + + double *RESULT = result != NULL ? result : rhs_cols > 0 ? rhs0 : M0; + + // Pivot_Cholesky: + // if (MPT == Morig || (rhs_cols > 0 && rhs == RESULT)) + // CERR0("Pivoted cholesky cannot be performed on place! Either you are a programmer or you should contact the maintainer."); + + // !MATRIXSQRT && rhs_cols > 0 + // assert(rhs != RESULT); + la_modes la_mode = OPTIONS.installNrun.la_mode; + if (size <= sp->tinysize) { + if (Pt != NULL) { + Pt->method = direct_formula; + Pt->size = size; + } + if (calculate == DETERMINANT) + return logdet3(det3(M0, size), posdef, logdet, sp->det_as_log); + else if (calculate == MATRIXSQRT) return chol3(M0, size, RESULT, Pt); + else return solve3(M0, size, posdef, rhs0, + (int) rhs_cols, RESULT, logdet, + sp->det_as_log, Pt); + } + + // printf("size=%d %d %d sparse=%d\n", size, sp->pivot, PIVOT_AUTO, sp->sparse); + // ERR0("XXXX"); + // printf("%d %d %d\n", PIVOT_AUTO, direct_formula,Pt->method); + + assert(SOLVE_METHODS >= 2); + + // printf("A\n"); + + // printf("A1\n"); + int err = NOERROR; + Long VARIABLE_IS_NOT_USED spam_zaehler = 0; // if spam is not compiled + int nnzA = 0; + Long + sizeSq = (Long) size * size, + sizeRHS = (Long) size * rhs_cols; + int sizeP1 = size + 1; + // printf("A1dd\n"); +#if defined compatibility_to_C_h + usr_bool sparse = False; +#else + usr_bool sparse = sp->sparse; +#endif + double spam_tol = sp->spam_tol; + // printf("A2\n"); + bool diag = false, + useGPU = la_mode == LA_GPU && + (calculate == SOLVE || calculate == DETERMINANT); + // printf("A3\n"); + + + // printf("A %d %d size=%d %d %d \n", sparse,Nan ,size, useGPU, sp->n_spam_min[useGPU]); + if (sparse == Nan && (sparse = (usr_bool) (size > sp->spam_min_n[useGPU]))) { + // printf("AB2\n"); + double mean_diag = 0.0; + for (Long i=0; i= sp->spam_sample_n * 3; + if (random_sample) { + // printf("A2E\n"); + double + thr = sp->spam_sample_n * (1.0 - sp->spam_min_p[useGPU]); + Long + threshold = (Long) (thr + SQRT(thr) * 3), + notZero = 0; + for (Long i=0; ispam_sample_n; i++) { + // printf("A2 %d %d\n", i , sp->spam_sample_n); + if ((notZero += !(FABS(M0[(i * sp->spam_factor) % sizeSq]) <= + spam_tol)) >= threshold){ + sparse = False; + break; + } + } + if (PL >= PL_FCTN_DETAILS) { + PRINTF("random sampling: sparse=%d\n", + sparse == Nan ? NA_INTEGER : (int) sparse); + } + } + /// printf("A2EX %d %d\n", random_sample, sparse == True); + if (!random_sample || sparse == True) { + // printf("AdC2\n"); + Long diag_nnzA = 0; + //#ifdef DO_PARALLEL + //#pragma omp parallel for num_threads(cores) schedule(dynamic,10) reduction(+:nnzA,diag_nnzA) + //#endif + for (Long i=0; i spam_tol + // bei NA/NaN + for (j=i * size; jspam_min_p[useGPU])); + spam_zaehler = nnzA + 1; + // printf("ddAdC2\n"); + if (PL >= PL_DETAILSUSER) { + if (diag) { PRINTF("diagonal matrix detected\n"); } + else if (sparse == True) { + PRINTF("sparse matrix detected (%3.2f%% zeros)\n", + 100.0 * (1.0 - (double) nnzA / (double) sizeSq)); + } else { + PRINTF("full matrix detected (%3.2f%% nonzeros)\n", + 100.0 * (double) nnzA / (double) sizeSq); } + } + } + } else { + // printf("ttAdC2\n"); + diag = true; + for (Long i=0; inewMethods; + pt->method = NoFurtherInversionMethod; + pt->size = size; + + + // printf("BA\n"); + if (diag) { + pt->method = Diagonal; + if (PL>=PL_STRUCTURE) { PRINTF("dealing with diagonal matrix\n"); } + if (logdet != NULL) { + *logdet = Determinant(M0, size, sp->det_as_log); + if (calculate == DETERMINANT) { + err = NOERROR; + goto ErrorHandling; + } + } + if (rhs_cols == 0) { + MEMCOPY(RESULT, M0, sizeSq * sizeof(*RESULT)); + if (calculate == MATRIXSQRT) { + for (Long i=0; i 0.0 ? SQRT(M0[i]) : 0.0; + } + } else { + for (Long i=0; iMethods[from] != NoFurtherInversionMethod && + sp->Methods[from] != NoInversionMethod) { + if (sp->Methods[from] == Sparse && sparse == True) from++; + else Meth[to++] = sp->Methods[from++]; + } + + // printf("from = %d (%d %d) [%d %d %d] sparse=%d %d\n", from, sp->Methods[0], sp->Methods[1], Meth[0], Meth[1], Meth[2], sparse == True, Sparse); + + if (from == 0) { // user did not give any method + if (posdef) { + if (to < SOLVE_METHODS) { + Meth[to++] = useGPU ? GPUcholesky : Cholesky; + if (to < SOLVE_METHODS) { + Meth[to++] = sp->pivot_mode != PIVOT_NONE && useGPU ? Cholesky : Eigen; + } + } + } else { + Meth[to++] = LU; + } + } else { + if (useGPU) + for (int i=0; i 0 + || + (SOLVE_METHODS > first_not_reading_M0 + 1 && + Meth[first_not_reading_M0 + 1] != Meth[first_not_reading_M0] && + Meth[first_not_reading_M0 + 1] != NoFurtherInversionMethod) + || + (Meth[first_not_reading_M0] == SVD && sp->svd_tol > 0.0 && + calculate != SOLVE) + ) { // at least two different Methods in the list + C_MALLOC(Long, main, sizeSq, double);//to pt->main, main local variable + MPT = pt->main; + if (rhs_cols > 0) { + C_MALLOC(Long, rhs, sizeRHS, double);//to pt->main, main local variab + RHS = pt->rhs; + } + } + } + } + + + // printf("AFF\n"); + + errorstring_type ErrStr; + STRCPY(ErrStr, ""); + + // printf("Meth=%d %d Chol=%d %d posdef=%d\n", Meth[0], Meth[1], Cholesky, SOLVE_METHODS, posdef); + + // for (int i=0; ipivot_mode; + for (int m=0; mmethod = Meth[m]; + if (pt->method == Cholesky && size > OPTIONS.installNrun.LaMaxTakeIntern) { + pt->method = calculate == DETERMINANT ? LU : Rcholesky; + } + if (pt->method < 0) break; + if (calculate != SOLVE) { + if (pt->method == NoInversionMethod && m<=sparse) BUG; + if (pt->method == NoFurtherInversionMethod) break; + if (PL>=PL_STRUCTURE) { + PRINTF("method to calculate the square root : %s\n", + InversionNames[pt->method]); + } + } else { + if (PL>=PL_STRUCTURE) { + PRINTF("method to calculate the inverse : %s\n", + InversionNames[pt->method]); + } + } + + if (MPT != M0 && m >= first_not_reading_M0) + MEMCOPY(MPT, M0, sizeSq * sizeof(*MPT)); + + if (RHS != rhs0) MEMCOPY(RHS, rhs0, (Long) sizeRHS * sizeof(*RHS)); + + switch(pt->method) { + case GPUcholesky : + if (size > 28000) + GERR0("due to a bug at NVIDIA the maximum size is currently limited to 28000."); + + if (!posdef) CERR0("Cholesky needs positive definite matrix"); +#ifdef USEGPU + if (proposed_pivot > PIVOT_AUTO) + GERR0("cholesky decomposition on GPU does not allow for pivoting"); + pt->actual_pivot = PIVOT_NONE; + { + double LD, + *LogDet = logdet == NULL ? &LD : logdet; + err = // int; see errors_messages.h for values 0...4 + cholGPU(true, // bool : in : says that values must be copied + // so in miraculix, RFU_AlexChol(false, ....) + // can be called + M0,// in: this matrix is copied by Alex because + // of value 'true' in the first argument, + // so contents never distroyed by Alex + size, // in: size of the matrix + rhs0, //in: if NULL the inverse of M is calculated; + // rhs is copied by Alex because of value 'true' + // in the first argument,, so never distroyed by Alex + rhs_cols, // in: number of columns on the right hand side + LogDet, // out : logarithm of the determinant of + // the sqare root(!) of the matrix M + RESULT); // out: a pointer to the result whether or + // not rhs is given + if (err != NOERROR) { + if (proposed_pivot == PIVOT_AUTO) proposed_pivot = PIVOT_DO; + continue; + } + if (logdet != NULL) { + *logdet *= 2; + if (!sp->det_as_log) *logdet = EXP(*logdet); + if (calculate == DETERMINANT) { + err = NOERROR; + goto ErrorHandling; + } + } + } +#else + // err = NOERROR; + BUG; +#endif + break; + case Rcholesky : { + // printf("chol (R)\n"); + if (calculate == SOLVE) { + if (rhs_cols > MAXINT) BUG; + int n_rhs = (int) rhs_cols; + double *mm = NULL; + CMALLOC(xja, size, int); + if (rhs_cols == 0) { + // printf("hier %ld %ld %ld res=%ld %ld %ld\n", RESULT, MPT, M0, result, rhs0, RHS); + if (RESULT == MPT){ + Long bytes = sizeSq * sizeof(*RESULT); + mm = (double *) MALLOC(bytes); + MEMCOPY(mm, M0, bytes); + } + + MEMSET(RESULT, 0, sizeof(*RESULT) * sizeSq); + for (Long i=0; i MAXINT) BUG; + n_rhs = size; + + /* + + for (Long i=0; idet_as_log, + xja); + FREE(mm); // NOTE: return / errrors only afterwards! + if (err != NOERROR) GERR0("LU algorithm failed."); + } else if (calculate == MATRIXSQRT) { + if (MPT != RESULT) MEMCOPY(RESULT, MPT, sizeSq * sizeof(*RESULT)); + F77dpotrf("U", &size, RESULT, &size, &err +#ifdef USE_FC_LEN_T + FCONE +#endif + ); + if (logdet != NULL) { + Determinant(RESULT, size, sp->det_as_log); + if (sp->det_as_log) *logdet *=2; else *logdet *= *logdet; + } + Long sizeM1 = size - 1; + for (Long i=0; i sp->max_chol) { + CERR2("Matrix is too large for Cholesky decomposition. Maximum ist currently a %d x %d matrix. Increase 'max_chol' in 'RFoption' if necessary.", + sp->max_chol, sp->max_chol); + } + + /// printf("size = %d %d %d\n", size, rhs_cols, size > rhs_cols ? size : rhs_cols); + + C_MALLOC(Long, D, size > rhs_cols ? size : rhs_cols, double); + for (Long i=0; iactual_pivot = PIVOT_UNDEFINED; + + if (proposed_pivot == PIVOT_NONE || proposed_pivot == PIVOT_AUTO) { + + // cmp for instance http://stackoverflow.com/questions/22479258/cholesky-decomposition-with-openmp + + // obere und untere dreiecksmatrix wird gelesen und obere geschrieben + err = NOERROR; + pt->actual_pivot = PIVOT_NONE; + { + double *A = MPT; + for (Long i=0; idet_as_log); + if (sp->det_as_log) *logdet *=2; else *logdet *= *logdet; + if (calculate == DETERMINANT) { + err = NOERROR; + goto ErrorHandling; + } + } + + if (rhs_cols == 0) chol2inv(MPT, size, cores); + else { // rhs_cols > 0 + //Long totalRHS = size * rhs_cols; + //if (result!=NULL) MEMCOPY(RESULT, rhs, sizeof(*RESULT)*totalRHS); +#ifdef DO_PARALLEL +#pragma omp parallel for num_threads(cores) if (rhs_cols > 30) schedule(static) +#endif + for (Long k=0; k 30) schedule(static) +#endif + for (Long k=0; k=0; i--) { + double *pM = MPT + i * size, + r = (p_RESULT[i] /= pM[i]); + LINEAR(pM, -r, i, p_RESULT); + // for (Long j=0; j PL_DETAILS) { PRINTF("trying pivoting\n"); } + int actual_size = NA_INTEGER; + // code according to Helmut Harbrecht,Michael Peters,Reinhold Schneider + // talk: The pivoted Cholesky decomposition and its application to + // stochastic PDEs + // ebenso: untere dreiecksmatrix wird gelesen; obere geschrieben + if (pt->actual_pivot == PIVOT_NONE) { + // wiederherstellung der Diagonalen und der unteren dreiecksmatrix + for (Long i=0; ipivot_idx); // ALWAYS FREE IT!!! cp Chol(S E X P M) + pt->pivot_idx = (int*) MALLOC((Long) size * sizeof(int)); + pt->n_pivot_idx = size; + pt->actual_pivot = PIVOT_DO; + for (int i=0; ipivot_idx[i] = i; + pt->actual_size = actual_size = size; + pi = pt->pivot_idx; + } else { // PIVOT_IDX + if (sp->n_pivot_idx < size || sp->actual_size > size) { + // printf("XA, %d %d %d\n", sp->n_pivot_idx , size, sp->actual_size); + CERR0("pivot_idx does not have the correct length.\nSet 'RFoption(pivot_idx=, pivot_actual_size=)' to the attributes of a\npivoted Cholesky decomposition."); + } + actual_size = pt->actual_size = sp->actual_size; + if (actual_size > size) BUG; + FREE(pt->pivot_idx); + Long bytes = (Long) size * sizeof(*(pt->pivot_idx)); + pt->pivot_idx = (int*) MALLOC(bytes); + MEMCOPY(pt->pivot_idx, sp->pivot_idx, bytes); + pt->actual_pivot = PIVOT_IDX; + pt->n_pivot_idx = sp->n_pivot_idx; + pi = sp->pivot_idx; + } + + err = NOERROR; + // printf("hier\n"); + double // *rhs = rhs, + // *M00 = M0, + rel_thres = 0, + max_deviation = sp->max_deviation, // 1e-10, + max_reldeviation = sp->max_reldeviation; // 1e-10, + + //printf("MTP %d %d %d\n", MPT == M0, rhs_cols, RHS == RESULT); + if (MPT == M0 || (rhs_cols > 0 && RHS == RESULT)) + CERR0("Pivoted cholesky cannot be performed on place! Either you are a programmer or you should contact the maintainer."); + /* + if (MPT == M0) { + CMALLOC(main, sizeSq, double); + MEMCOPY(main, M0, sizeSq * sizeof(*main)); + M00 = main; + } + if (rhs_cols > 0 && rhs == RESULT) { + Long totalRHS = (Long) size * rhs_cols; + CMALLOC(U, totalRHS, double); + MEMCOPY(U, rhs, totalRHS * sizeof(*U)); + RHS = U; + }*/ + for (Long q=0; qactual_pivot == PIVOT_DO) { + double + max = RF_NEGINF, + deviation = 0.0; + Long k, + argmax = NA_INTEGER; + for (k=q; kpivot_relerror * (double) sizeSq){ + C_GERR1("matrix not positive definite or increase 'pivot_relerror' by at least factor %10g.", dummy * -1e4 / (double) sizeSq, ERR_CHOL); + } + deviation += dummy; + if (max < dummy) { + max = dummy; + argmax = k; + } + } + + double dev = rel_thres * max_reldeviation; + if (deviation <= max_deviation || (q > 0 && deviation <= dev) ) { + if (q > MAXINT) BUG; + actual_size = pt->actual_size = (int) q; + if (sp->pivot_check != False) { + double largest = 0; + for (Long i=q; i largest ? absm : largest; + // if(absm == largest || absm > 5) printf("%10e %d %d; %d\n", absm, i, j, size); + } + } + if (largest > max_deviation || (q > 0 && largest > dev)) { + char msg[500]; + SPRINTF(msg, "Matrix has a numerically zero, leading value at the %d-th pivoted line, but the largest deviation %10e from zero in the rest of the matrix is greater than the tolerance %10e. %.50s.", + (int) q, + largest, + MAX(max_deviation, dev), + sp->pivot_check == True + ? "If you are sure that the matrix is semi-definite, set 'RFoptions(pivot_check=NA)' or 'RFoptions(pivot_check=True)'" + : "The result can be imprecise"); + if (sp->pivot_check == True) C_GERR0(msg, ERR_CHOL) + else WARN0(msg); + } + } + break; + } + rel_thres += D[pi[q]]; + int dummy = pi[q]; + pi[q] = pi[argmax]; + pi[argmax] = dummy; + } + + + Long pqq = pi[q], + col_q = pqq * size; + + + if (D[pqq] < 0) { + C_GERR1("Negative leading value found at the %d-th pivoted line.", + (int) q, ERR_CHOL); + } + double lqpq = MPT[q + col_q] = SQRT(D[pqq]); + +#ifdef DO_PARALLEL +#pragma omp parallel for num_threads(cores) if (MULTIMINSIZE(size - q)) schedule(dynamic, 8) +#endif + for (Long i=q+1; iactual_pivot == PIVOT_DO) { + N = actual_size; + } + if (sp->det_as_log) { + if (N < size && !sp->pivot_partialdet) *logdet = RF_NEGINF; + else { + double logD = 0.0; + for (Long i=0; i < N; i++) + logD += LOG(MPT[i + pi[i] * (Long) size]); + *logdet = logD * 2; + } + } else { + if (N < size && !sp->pivot_partialdet) *logdet = 0; + else { + double logD = 1.0; + for (Long i=0; i < N; i++) + logD *= MPT[i + pi[i] * (Long) size]; + *logdet = logD * logD; + } + } + if (calculate == DETERMINANT) { + err = NOERROR; + goto ErrorHandling; + } + } + + ////////////////////////////////////////////////// + ////////////////////////////////////////////////// + + if (rhs_cols == 0) { + if (actual_size < size) + GERR0("Matrix not definite. Try ") + +#ifdef DO_PARALLEL +#pragma omp parallel for num_threads(cores) if (size > 60) schedule(dynamic, 20) +#endif + for (Long k=0 ; k 60) schedule(dynamic, 20) +#endif + for (Long k=0; kk; i--) { + double *pM = MPT + pi[i] * (Long) size, + r = (p_RESULT[i] /= pM[i]); + D[k] -= r * pM[k]; + LINEAR(pM + k + 1, -r, i-k-1, p_RESULT + k + 1); + // for (Long j=k+1; j 0 + // if (rhs0 == RESULT) { + // /* crash(); */ + // #pragma GCC diagnostic push + //#pragma GCC diagnostic ignored "-Wuninitialized" + // Long i; PRINTF("%d\n", i);char m[1];m[i] = m[i-9] + 4; if (m[0]) i++; else i--; PRINTF("%s\n", m); // not MEMCOPY + //#pragma GCC diagnostic pop + // int *x = (int*) MALLOC(1000000); f ree(x); f ree(x); x[100] = 100; + // } + // printf("%ld %ld %ld\n", rhs0 , RESULT, RHS); + //assert(rhs0 != RESULT); + // assert(RHS != RESULT); + double eps = D[0] * sp->pivot_relerror; + +#ifdef DO_PARALLEL +#pragma omp parallel for num_threads(cores) if (rhs_cols > 30) schedule(static) +#endif + for (Long k=0; k eps) { + if (Pt == NULL) solve_DELETE(&pt); +#ifdef DO_PARALLEL + RFERROR("Equation system not solvable"); // RFERROR necessary. +#else + GERR1("Equation system not solvable (difference %10e). Try increasing 'pivot_relerror' in 'RFoptions' to get an approximate solution.", + p_rhs[pii] - SCALAR(pM, p_RESULT, i)); +#endif + + } + } + } + +#ifdef DO_PARALLEL +#pragma omp parallel for num_threads(cores) schedule(static) if (rhs_cols > 30) +#endif + for (Long k=0; k=0; i--) { + Long pii = pi[i]; + double *pM = MPT + pii * size, + r = (p_RESULT[i] /= pM[i]); + LINEAR(pM, -r, i, p_RESULT); + // for (Long j=0; j 0 + } // not sqrt only + + } // err == NOERROR + } + + ERR_CHOL: + + if (err != NOERROR) { + if (pt->actual_pivot == PIVOT_NONE) + CERR2("Probably matrix not positive definite: %.300s. Consider 'RFoptions(%.80s)'.\n", + ErrStr, + calculate != SOLVE || rhs_cols > 0 ? + "'pivot=PIVOT_AUTO)' or 'RFoptions(pivot=PIVOT_DO" + : "solve_method=\"eigen\", solve.pseudoinverse=TRUE") // OK + + + else // pt->actual_pivot == PIVOT_DO or PIVOT_IDX + CERR1("Likely, the matrix is not positive semi definite: %.300s. Consider 'RFoptions(solve_method=\"svn\"\n", ErrStr) + } + + if (PL >= PL_DETAILSUSER) { + PRINTF("Cholesky decomposition successful\n"); + } + } + break; + case QR : {// QR returns transposed of the inverse !! + if (rhs_cols > 0 || logdet != NULL || calculate != SOLVE) { + err = ERRORFAILED; + continue; + } + + err = ERRORNOTPROGRAMMEDYET; /// to do: clarify transposed ! + continue; + + CMALLOC(w2, size, double); + CMALLOC(w3, size, double); + + F77dgeqrf(&size, &size, // QR + MPT, &size, // aijmax, &irank, inc, w2, + w3, w2, &size, &err); + assert(false); // code zur invertierung fehlt hier! + + if (err != NOERROR) { + CERR1("'dgeqrf' failed with err=%d.", err); + } + if (PL >= PL_DETAILSUSER) { PRINTF("QR successful\n"); } + break; + } + + case Eigen : { // M = U D UT + int max_eigen = sp->max_svd; + double eigen2zero = sp->eigen2zero; + if (size > max_eigen) CERR0("matrix too large for Cholesky or eigen value decomposition. Increase 'max_chol' and 'max_svd' in 'RFoption' if necessary."); + + double + optimal_work, + *pt_work = &optimal_work; + int k=0, + optimal_intwork, + *pt_iwork = &optimal_intwork, + lw2 = -1, + lintwork = -1; + + C_MALLOC(Long, U, sizeSq, double); + CMALLOC(D, size, double); + CMALLOC(xja, 2 * (Long) size, int); + CMALLOC(w3, size, double); + + for (int i=0; i<=1; i++) { + double dummy = 0.0, + abstol = 0.0; + int dummy_nr; + + // printf("%f %f %f %f\n", MPT[0], MPT[1], MPT[2], MPT[3]); + F77dsyevr("V", "A", "U", &size, // Eigen + MPT, &size, &dummy, &dummy, &k, &k, + &abstol,// or DLAMCH + &dummy_nr, D, U, &size, + xja, // 2 * size * size of(integer); nonzeros_idx + pt_work, &lw2, + pt_iwork, &lintwork, + &err +#ifdef USE_FC_LEN_T + FCONE FCONE FCONE +#endif + ); + // printf("eigen %d %f %f; %e %e %e %e\n", err, D[0], D[1], U[0], U[1], U[2], U[3]); + if (i==1 || err != NOERROR || ISNAN(D[0])) break; + lw2 = (int) optimal_work; + lintwork = (int) optimal_intwork; + CMALLOC(w2, lw2, double); + CMALLOC(iwork, lintwork, int); + pt_iwork = iwork; + pt_work = w2; + } + + + if (err != NOERROR) { + if (PL>PL_ERRORS) { PRINTF("F77 error code for 'dsyevr' = %d\n", err);} + CERR1("'dsyevr' failed with err=%d.", err); + break; + } + + for (Long i=0; i -eigen2zero * 100]); + + } + + if (calculate == MATRIXSQRT) { + for (Long j=0; j= eigen2zero) dummy = SQRT(D[j]); + for (Long i=0; idet_as_log); + if (calculate == DETERMINANT) { + err = NOERROR; + goto ErrorHandling; + } + } + + // printf("Hiere EIGGEN\n"); + bool pseudoInverse = false; + for (Long j=0; j 0) { + Long tot = (Long) size * rhs_cols; + C_MALLOC(Long, w2, tot, double); + matmulttransposed(U, RHS, w2, size, size, rhs_cols, cores); + if (pseudoInverse) { + for (k=0; k eigen2zero) { + GERR0("singular matrix problem does not have a solution"); + } + k++; + } + } + } else { + for (k=0; kpseudoinverse) + GERR0("Singular matrix: inverse does not exist. Consider 'RFoption(solve.pseudoinverse=TRUE)'"); + C_MALLOC(Long, w2, sizeSq, double); + for (Long kk=0, j=0; j= PL_DETAILSUSER) { + PRINTF("eigen value decomposition successful\n"); + } + break; + } + + case SVD : {// SVD : M = U D VT + if (size > sp->max_svd) CERR0("matrix too large for SVD decomposition."); + int + lw2 = -1, + size8 = size * 8; + double optim_lwork, + eigen2zero = sp->eigen2zero, + *pt_w2 = &optim_lwork; + + C_MALLOC(Long, w3, sizeSq, double); + C_MALLOC(Long, U, sizeSq, double); + CMALLOC(D, size, double); + CMALLOC(iwork, size8, int); + CMALLOC(lnz, size, double); + + for (Long i=0; i<=1; i++) { + F77dgesdd("A", &size, &size, // SVD + MPT, &size, D, U, &size, w3, &size, + pt_w2, &lw2, iwork, &err +#ifdef USE_FC_LEN_T + FCONE +#endif + ); + if (i==1 || err != NOERROR || ISNAN(D[0])) break; + lw2 = (int) optim_lwork; + CMALLOC(w2, lw2, double); + pt_w2 = w2; + } + if (err != NOERROR) { + if (PL>PL_ERRORS) { PRINTF("F77 error code for 'dgesdd' = %d\n", err);} + CERR1("'dgesdd' failed with err=%d.", err); + break; + } + + if (calculate == MATRIXSQRT) { + double svdtol = sp->svd_tol; + /* calculate SQRT of covariance matrix */ + for (Long j=0, k=0; j= eigen2zero) dummy = SQRT(D[j]); + for (Long i=0; i 0.0) { + for (Long i=0; i svdtol) { + if (PL > PL_ERRORS) { + PRINTF("difference %10e at (%ld,%ld) between the value (%10e) of the covariance matrix and the square of its root (%10e).\n", + M0[i * size +k] - sum, i, k, M0[i*size+k], sum); + } + FERR3("required precision not attained (%10e > %10e): probably invalid model. See also '%.50s'.", FABS(M0[i * size + k] - sum), svdtol, + solve[SOLVE_SVD_TOL]); + + err=ERRORM; + break; + } //else printf("ok (%d,%d) %10g %10g\n", i, k, M0[i*size+k],sum); + } + if (err != NOERROR) break; + } + if (err != NOERROR) break; + } // end if svdtol > 0 + + } else { + // calculate determinant + if (logdet != NULL) { + *logdet = cumProd(D, size, sp->det_as_log); + if (calculate == DETERMINANT) { + err = NOERROR; + goto ErrorHandling; + } + } + + bool pseudoInverse = false; + for (Long j=0; j 0) { + Long tot = (Long) size * rhs_cols; + C_MALLOC(Long, w2, tot, double); + matmulttransposed(U, RHS, w2, size, size, rhs_cols,cores); + if (pseudoInverse) { + for (Long k=0; k eigen2zero) { + GERR0("singular matrix problem does not have a solution."); + } + k++; + } + } + } else { + for (Long k=0; kpseudoinverse) + GERR0("Singular matrix: inverse does not exist. Consider 'RFoption(solve.pseudoinverse=TRUE)'"); + for (Long k=0, j=0; j= PL_DETAILSUSER) { PRINTF("svd successful\n"); } + break; + } + + case LU : {// LU + //printf("LU\n"); + if (calculate == MATRIXSQRT) { + err = ERRORFAILED; + continue; + } + + CMALLOC(xja, size, int); + F77dgetrf(&size, &size, MPT, &size, xja, &err); + if (err != NOERROR) { + CERR1("'dgetrf' (LU) failed with err=%d.", err); + } + + //printf("LU %d\n", logdet != NULL); + + + if (logdet != NULL) { + *logdet = DeterminantLU(MPT, size, sp->det_as_log, xja); + if (calculate == DETERMINANT) { + err = NOERROR; + goto ErrorHandling; + } + } + + if (rhs_cols > 0) { + if (rhs_cols > MAXINT) BUG; + int rhs_cols0 = (int) rhs_cols; + Long totalRHS = (Long) size * rhs_cols; + if (result != NULL) MEMCOPY(RESULT, RHS, sizeof(*RESULT) * totalRHS); + F77dgetrs("N", &size, // LU rhs + &rhs_cols0, MPT, &size, xja, + RESULT, &size, &err +#ifdef USE_FC_LEN_T + FCONE +#endif + ); + if (err != NOERROR) { + CERR1("'dgetrs' (LU) failed with err=%d.", err); + } + } else { + int lw2 = -1; + double dummy, + *p = &dummy; + for (int i=0; i<=1; i++) { + F77dgetri(&size, MPT, // LU solve + &size, xja, p, &lw2, &err); + if (err != NOERROR) break; + lw2 = (int) dummy; + CMALLOC(w2, lw2, double); + p = w2; + } + } + if (PL >= PL_DETAILSUSER) { PRINTF("LU decomposition successful\n"); } + break; + } + + case Sparse : {// sparse matrix +#if defined compatibility_to_C_h + BUG; +#else + if (sizeSq > MAXINT) BUG; + + Long halfsq = (Long) size * (size + 1) / 2; + int nnzlindx = -1, + doperm = sp->pivotsparse, + nnzcolindices = 0, + nnzR = 0, + cache = 512, // to do: CPU cache size + nnzcfact[3] = { 5, 1, 5 }, + nnzRfact[3] = { 5, 1, 2 }; + double + cholincrease_nnzcol = 1.25, + cholincrease_nnzR = 1.25; + + if (!posdef) CERR0("'spam' needs a positive definite matrix."); + CMALLOC(pivotsparse, size, int); + if (!doperm) for (int i=0; insuper = 0; + // calculate spam_cholesky + err = 4; // to get into the while loop + while (err == 4 || err == 5) { + if (nnzcolindices == 0) { + double rel = nnzA / (double) size; + if (rel < 5) { + nnzcolindices = (int) CEIL(nnzA * (1.05 * rel - 3.8)); + if (nnzcolindices < 1000) nnzcolindices = 1000; + } else { + nnzcolindices = nnzA; + } + nnzcolindices *= nnzcfact[doperm]; + if (nnzcolindices < nnzA) nnzcolindices = nnzA; + } else if (err == 5) { + int tmp = (int) CEIL(nnzcolindices * cholincrease_nnzcol); + if (PL > PL_RECURSIVE) { + PRINTF("Increased 'nnzcolindices' with 'NgPeyton' method\n(currently set to %d from %d)", tmp, nnzR); + } + nnzcolindices = tmp; + } + if (nnzcolindices < pt->n_lindx) nnzcolindices = pt->n_lindx; + + if (nnzR == 0) { + double u = FLOOR(.4 * POW(nnzA, 1.2)); + u = u < 4 * nnzA ? 4 * nnzA : CEIL(u); + nnzR = (int) u * nnzRfact[doperm]; + } else if (err == 4) { + int tmp = (int) CEIL(nnzR * cholincrease_nnzR); + if (PL > PL_RECURSIVE) { + PRINTF("Increased 'nnzR' with 'NgPeyton' method\n(currently set to %d from %d)", tmp, nnzR); + } + nnzR = tmp; + } + if (nnzR < pt->n_lnz) nnzR = pt->n_lnz; + else if (nnzR > halfsq) nnzR = (int) halfsq; + + CMALLOC(lindx, nnzcolindices, int); + CMALLOC(lnz, nnzR, double); + + F77cholstepwise(&size, &nnzA, D, cols, rows, &doperm, + invp, pivotsparse, + &nnzlindx, &nnzcolindices, + lindx, // + iwork,// + &(pt->nsuper), // length of lindx + &nnzR, // physical length of lindx + lnz, // output:result + xlnz, // cols of lnz "ja" + snode, // supernode membership ?? + xsuper, // supernode partioning + &cache, // cache size of the CPU + &err + ); + + if (err != NOERROR) { + CERR1("'cholstepwise' failed with err=%d.", err); + break; + } + } // while + + if (err != NOERROR) CERR0("'spam' failed."); + if (PL >= PL_DETAILSUSER) { PRINTF("'spam' successful\n"); } + + // spam solve + + if (calculate == MATRIXSQRT) { + + //BUG; // unexpected behaviour in spam + + nnzR = xlnz[size] - 1; + CMALLOC(xja, nnzR, int); + F77calcja(&size, &(pt->nsuper), pt->xsuper, + pt->lindx, pt->iwork, pt->xlnz, xja); + for (Long i=0; ilnz, xja, pt->xlnz, RESULT); + for (Long i=0; ilnz; + int *lindx = pt->lindx; + + // spam determinant + if (logdet != NULL) { + if (sp->det_as_log) { + double tmp = 0.0; + for (Long i=0; i MAXINT) BUG; + RHS_COLS = (int) rhs_cols; + if (result != NULL) + MEMCOPY(RESULT, RHS, (Long) size * rhs_cols * sizeof(*RESULT)); + } + + //printf("nsuper=%d\n", pt->nsuper); + // for (Long ii=0; iinsuper, sizeP1, xsuper[ii], + // w3[ii]); + + // if (false) + // for (Long jsub=0; jsub<=pt->nsuper; jsub++) { + // int fj = xsuper[1 - 1], + // Lj = xsuper[jsub + 1 - 1] -1; + // printf("%d %d %d\n", jsub, fj, Lj); + // for (Long jcol=fj; jcol <= Lj; jcol++) { + // printf("%d,%10g ", jcol, w3[jcol - 1]); + // } + // } + + // for (Long jcol=1; jcol <= 600; jcol++) { + // w3[jcol - 1] = jcol; + // printf("%d,%10g ", jcol, w3[jcol - 1]); + // } + + + // printf("%ld %ld %d\n", RESULT, rhs, rhs_cols); + // for (Long ii=0; iinsuper), &RHS_COLS, + lindx, // colindices + iwork, //colpointers + lnz, + xlnz, // rowpointers + invp, pivotsparse, + xsuper, // supernodes + w3, RESULT); + if (PL >= PL_DETAILSUSER) { PRINTF("'spam' successful\n"); } + } + break; +#endif + } // Sparse + + case NoInversionMethod: GERR0("no inversion method given."); + case NoFurtherInversionMethod: + STRCPY(ErrStr, WHICH_ERRORSTRING); + GERR1("%.300s (All specified matrix inversion methods have failed.)", + ErrStr); + case direct_formula: case Diagonal: + GERR1("strange method appeared:%.200s.", CONTACT); + default : GERR1("unknown method (%d) in 'RandomFieldsUtils'.", pt->method); + } // switch + + if (err==NOERROR) break; + } // for m + + + ErrorHandling: + if (Pt == NULL) solve_DELETE(&pt); + else Pt->sparse = sparse; + + // if (err != NOERROR) { printf("Err = %s %s %d\n", ErrStr, WHICH_ERRORSTRING ,err); exit(0);} + + return err; // -method; +} + + + + +int SolvePosDef(double *M, int size, bool posdef, + double *rhs, Long rhs_cols, double *logdet, + solve_storage *PT, + int VARIABLE_IS_NOT_USED cores) { + if ((rhs == NULL) xor (rhs_cols == 0)) BUG; + return doPosDefIntern(M, size, posdef, rhs, rhs_cols, + NULL, // result, so result returned in M or rhs + logdet, + SOLVE, // calculate + PT, // storage + &(OPTIONS.solve), cores); +} + + +int SolvePosDefSp(double *M, int size, bool posdef, + double *rhs, Long rhs_cols, double *logdet, + solve_storage *PT, solve_options * sp, int VARIABLE_IS_NOT_USED cores) { + if ((rhs == NULL) xor (rhs_cols == 0)) BUG; + return doPosDefIntern(M, size, posdef, rhs, rhs_cols, NULL, logdet, SOLVE, + PT, sp, cores); +} + + + + +int xCinvYdet(double *M, int size, bool posdef, double *X, double *Y, + Long cols, double *XCinvY, double *det, bool log, + solve_storage *PT, int VARIABLE_IS_NOT_USED cores) { + // called by randomfields + int NR = KAHAN ? SCALAR_KAHAN : SCALAR_AVX; + bool pt = PT != NULL && PT->result != NULL; + double *result; + if (pt) result=PT->result; + else result= (double *) MALLOC(sizeof(*result) * (Long) size * cols); + if (result == NULL) return ERRORMEMORYALLOCATION; + double *res = result; + solve_options sp; + MEMCOPY(&sp, &(OPTIONS.solve), sizeof(solve_options)); + sp.det_as_log = log; + int err = doPosDefIntern(M,// no PROTECT( needed + size, posdef, Y, cols, result, det, SOLVE, PT, &sp, + cores); + for (Long i=0; iMethods; + double *res = NULL; + bool extra_alloc = Meth[0] == NoInversionMethod || + Meth[0] == NoFurtherInversionMethod || + (Meth[1] != NoInversionMethod && Meth[1] != NoFurtherInversionMethod && + Meth[1] != Meth[0]) || + (Meth[0] != Cholesky && Meth[0] != Eigen && Meth[0] != SVD); + assert(pt != NULL); + + if (sp->sparse == True) + warning("package 'spam' is currently not used for simulation"); + sp->sparse = False; + + if (extra_alloc) { + C_MALLOC(Long, result, sizeSq, double); + res = result; + } else { + FREE(pt->result); + pt->result = M; + pt->n_result = sizeSq; + } + + // it is ok to have + // ==15170== Syscall param sched_setaffinity(mask) points to unaddressable byte(s) + // caused by gcc stuff + err = doPosDefIntern(M, size, true, NULL, 0, res, NULL, MATRIXSQRT, pt, sp, + cores);// no PROTECT( needed + + if (extra_alloc) { + + +#if defined MSDOS_WINDOWS + pt->to_be_deleted = M; +#else + FREE(M); +#endif + } + return err; +} + + +void sqrtRHS_Chol(double *U, int size, double* RHS, Long RHS_size, Long n, + double *result, + bool pivot, int act_size, + int cores, + int *pi) { + // printf("n=%d,rhss=%d si=%d pivot=%d, act=%d U=%d RHS=%d %d pi=%d\n", + // n, RHS_size, size,pivot, act_size, U!=NULL, RHS!=NULL, result!=NULL, pi!=NULL ); +// for (Long i=0; isize; + switch (pt->method) { + case Rcholesky : { + //printf("RchoL\n"); + int incx = 1; + MEMCOPY(result, RHS, (Long) size * sizeof(*result)); + F77dtrmv("U", "T", "N", &size, pt->result, &size, result, &incx +#ifdef USE_FC_LEN_T + FCONE FCONE FCONE +#endif + ); + } + break; + case GPUcholesky : + // Alex: Gegebenenfalls schnelle GPU Version von Deiner Seite + + case direct_formula : + case Cholesky : { + // printf("intern\n"); + bool pivot = (pt->actual_pivot == PIVOT_DO || + pt->actual_pivot == PIVOT_IDX) && + pt->method != direct_formula; + if (pivot && pt->n_pivot_idx != size) BUG; + + sqrtRHS_Chol(pt->result, size, RHS, size, 1, result, pivot, + pivot ? pt->actual_size : NA_INTEGER, cores, + pt->pivot_idx); + return NOERROR; + } + case SVD : case Eigen : { + double *U = pt->result; + assert(U != NULL); +#ifdef DO_PARALLEL +#pragma omp parallel for num_threads(cores) schedule(static) if (MULTIMINSIZE(size)) +#endif + for (Long i=0; iD != NULL); + F77amuxmat(&size, &size, &one, RHS, pt->D, pt->lnz, + pt->xja, pt->xlnz); + for (Long i=0; iD[pt->invp[i]]; + } + break; + + case Diagonal : { + Long i, j, + sizeP1 = size + 1; + double *D = pt->result; + assert(D != NULL); + for (i=j=0; jmethod %d\n", pt->method); + BUG; + } + + return NOERROR; +} diff --git a/src/miraculix/transformUint.cc b/src/miraculix/transformUint.cc index 0ec23b36..66f4ea99 100755 --- a/src/miraculix/transformUint.cc +++ b/src/miraculix/transformUint.cc @@ -466,7 +466,6 @@ void transform(unit_t *Old, Long OldSnps, Long OldIndiv, Long OldLDA, tmp = (unit_t*) algn_generalL( (int*)(tmp##_unaligned = (unit_t*) \ CALLOC((inUnits)+safety, BytesPerUnit)), \ MAX_LDA_BITALIGN ) - if (isHaplo(OldCoding)) { // printf("!!haplo\n"); @@ -987,6 +986,7 @@ void transposeIntern(unit_t * Code, Long snps, Long individuals, // printf("k=%d %d\n", k, rest_indiv > 0); if (k) { zeros = (unit_t*) CALLOC(real_units[0] + 1, BytesPerUnit); //real_units=[0]=0 moeglich! + assert(zeros != NULL); Code0 = Code + Lda * indiv ; Code1 = codePerByte > 1 ? Code0 + Lda : zeros; Code2 = codePerByte > 2 ? Code1 + Lda : zeros; diff --git a/src/miraculix/xport_import_rrfu.cc b/src/miraculix/xport_import_rrfu.cc new file mode 100755 index 00000000..7ef2d65b --- /dev/null +++ b/src/miraculix/xport_import_rrfu.cc @@ -0,0 +1,51 @@ +/* + Authors + Martin Schlather, martin.schlather@uni-mannheim.de + + Copyright (C) 2022-2023 Martin Schlather + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + + + +#include "Basic_RFUlocal.h" +#include "extern_RFU.h" + + +void startRFU() { + // utilsoption_type *utils = &OPTIONS; + union { + unsigned short a; + unsigned char b[2]; + } ab; + ab.a = 0xFF00; + bool bigendian = ab.b[0] != 0; + assert((__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) xor bigendian); + OPTIONS.basic.bigendian = bigendian; + + volatile uni64 U64; + U64.u32[bigendian] = 1954; + U64.u32[!bigendian] = 0x7ff00000; + OPTIONS.basic.NA = U64.d8[0]; + OPTIONS.basic.NaN = 0.0 / 0.0; + +#if defined compatibility_to_C_h + ownNA = OPTIONS.basic.NA; + ownNaN = OPTIONS.basic.NaN; +#endif + +} + + + diff --git a/tests/dgemm_compressed/test.jl b/tests/dgemm_compressed/test.jl index b28d1c95..a9f84169 100644 --- a/tests/dgemm_compressed/test.jl +++ b/tests/dgemm_compressed/test.jl @@ -35,7 +35,7 @@ include(MODULE_PATH) println("Load library and set options") miraculix.dgemm_compressed.set_library_path(LIBRARY_PATH) miraculix.dgemm_compressed.load_shared_library() -miraculix.dgemm_compressed.set_options(use_gpu=!false) +miraculix.dgemm_compressed.set_options(use_gpu=false, verbose=0) println("Read bed file and frequencies") genotype_data, n_snps, n_indiv = miraculix.read_plink.read_bed(DATA_FILE)