From 0c86c56e8e1f58b67dbdd759298aabad35711603 Mon Sep 17 00:00:00 2001
From: Alexander Freudenberg
 <50404848+alexfreudenberg@users.noreply.github.com>
Date: Fri, 26 May 2023 08:39:52 +0200
Subject: [PATCH] Merge Martin's version 25/05/23 (#6)

Merge changes to core miraculix by Martin
---
 .gitignore                            |    2 +
 src/miraculix/1bit128.cc~             |  171 --
 src/miraculix/2bitUint.cc             |    2 +-
 src/miraculix/3bitUint.cc             |    2 +-
 src/miraculix/5codes512.cc            |    8 +-
 src/miraculix/5codesAPI.c             |    6 +-
 src/miraculix/5codesChar.cc~          |  434 -----
 src/miraculix/5codesIntern.h          |  193 ++-
 src/miraculix/5codesUint.cc           |   24 +-
 src/miraculix/Files32.cc              |    8 +-
 src/miraculix/FilesUint.cc            |    3 +-
 src/miraculix/MXinfo.h                |    1 +
 src/miraculix/MoBPS_R.cc              |    2 -
 src/miraculix/Template.h              |   14 +-
 src/miraculix/Vector.matrix.Uint.cc   |    2 +-
 src/miraculix/compatibility.C.h       |    6 +-
 src/miraculix/compatibility.R.h       |    6 +-
 src/miraculix/compatibility.c.cc      |    7 +
 src/miraculix/compatibility.general.h |    3 +-
 src/miraculix/compatibility.sexp.cc   |    2 +-
 src/miraculix/haplogeno.cc            |    8 +-
 src/miraculix/intrinsics.h            |    5 +-
 src/miraculix/intrinsicsIntern.h      |    4 +-
 src/miraculix/kleinkram.cc            |    1 -
 src/miraculix/main.cc                 |   89 +-
 src/miraculix/makefile.c.mk           |    7 +-
 src/miraculix/makefile.miraculix.mk~  |   22 -
 src/miraculix/makefile.rfu.mk         |    2 +-
 src/miraculix/makefile.rfu.mk~        |   24 -
 src/miraculix/options_rrfu.cc         |  284 ++++
 src/miraculix/parallel_simd.h         |    3 +-
 src/miraculix/plink.h                 |    2 +-
 src/miraculix/plink256.cc             |    4 +-
 src/miraculix/plinkUint.cc            |    7 +-
 src/miraculix/solve_rrfu.cc           | 2223 +++++++++++++++++++++++++
 src/miraculix/transformUint.cc        |    2 +-
 src/miraculix/xport_import_rrfu.cc    |   51 +
 tests/dgemm_compressed/test.jl        |    2 +-
 38 files changed, 2802 insertions(+), 834 deletions(-)
 delete mode 100755 src/miraculix/1bit128.cc~
 delete mode 100755 src/miraculix/5codesChar.cc~
 delete mode 100755 src/miraculix/makefile.miraculix.mk~
 delete mode 100755 src/miraculix/makefile.rfu.mk~
 create mode 100755 src/miraculix/options_rrfu.cc
 create mode 100755 src/miraculix/solve_rrfu.cc
 create mode 100755 src/miraculix/xport_import_rrfu.cc

diff --git a/.gitignore b/.gitignore
index 6c104721..6dd69f46 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,8 @@
 *.freq
 *.bed
 *.mem
+*.*~
+
 
 *.optrpt
 *.tgz
diff --git a/src/miraculix/1bit128.cc~ b/src/miraculix/1bit128.cc~
deleted file mode 100755
index fd698dce..00000000
--- a/src/miraculix/1bit128.cc~
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- Authors 
- Guido Moerkotto
- maintained and modified by Martin Schlather, martin.schlather@uni-mannheim.de
-
- Copyright (C) 2022-2023 Guido Moerkotte, Martin Schlather
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-
-
-#define BitsPerCode 1
-#define MY_VARIANT 128
-
-#include "Basic_miraculix.h"
-#include "intrinsics_specific.h"
-#include "xport_import.h"
-#include "options.h"
-//
-//
-
-#if defined SSSE3 
-
-ASSERT_SIMD(1bit128, ssse3);
-
-#include "MX.h"
-#include "haplogeno.h"
-#include "intrinsicsIntern.h"
-#include "1bitIntern.h"
-
-
-
-void trafo1Geno2Geno128(unit_t *OLD, Long snps, Long individuals,
-			Long oldLDA,  unit_t *Ans, Long ldAns) {
-  assert(2 * oldLDA == ldAns);
-  
-  Long blocks = Blocks(snps);
-
-  if (OLD == Ans) {
-    ERR0("not programed yet");
-    // (Braucht nur 1/4 Speicherplatz extra)
-    // (i)drittes Viertel rauskopieren + 1-2 Spalten mehr vorne freilasen
-    // (ii) haelfte durcharbeitne
-    // (iii) drittel Vierel aus speicher + in die Spalte(n) davor im
-    //       Speicher viertel Viertel spaltenweise reinkopieren
-    // (iv) letztes Viertel aus dem Speicher mit den 1-2 Spalten frueher
-    //      abarbeiten.
-  }	
-  
-   
-  const BlockType 
-    O2F2 = SET32(0x00FF00FF),
-    F2O2 = SET32(0xFF00FF00),
-    
-    O4F4 = SET32(0x0000FFFF),
-    F4O4 = SET32(0xFFFF0000);
-
-  
-#if ! defined PlainInteger32
-  const BlockType O8F8 = SET64(0x00000000FFFFFFFF);
-#endif	
-
-#if defined AVX512F
-  const BlockType cross_idx =
-    _mm512_set_epi32(TAKE_B + 14, TAKE_B + 12, TAKE_B + 10, TAKE_B + 8,
-		     TAKE_B + 6, TAKE_B + 4, TAKE_B + 2, TAKE_B + 0,
-		     14, 12, 10, 8, 6, 4, 2, 0);
-#elif defined AVX2
-  const BlockType cross_idxA = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
-  const BlockType cross_idxC = _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4);
-  const int halfreg_idx = 0 + (2 << 4);
-#elif defined SSE2
-  const int cross_idxA =  0 + (1 << 4);
-  const int cross_idxC = 2 + (3 << 4);
-#elif defined PlainInteger64 || defined PlainIteger32
-  #define cross_idxA 0
-  #define cross_idxC 1
-#endif
-
- 
-
-  for (Long j=0; j<individuals; j++) {
-    unit_t *New = Ans + j * ldAns,
-      *halfmatrix = New + individuals * ldAns,
-    *Old = OLD + j * oldLDA;
-    
-    for (Long i=0; i<blocks; i++) {	
-      const BlockType a0 = LOAD((BlockType0*) Old);
-      const BlockType b0 = LOAD((BlockType0*) halfmatrix);
-      
-      const BlockType a1 = CROSSLINE_PERMUTE(a0, cross_idxA);
-      const BlockType b1 = CROSSLINE_PERMUTE(b0, cross_idxA);
-      const BlockType c1 = CROSSLINE_PERMUTE(a0, cross_idxC);
-      const BlockType d1 = CROSSLINE_PERMUTE(b0, cross_idxC);
-
-    
-#if defined PlainInteger32
-      const BlockType a5 = a1,
-	b5 = b1,
-	c5 = c1,
-	d5 = d1;
-#else
-      AND4_C(2, 1, O8F8);
-      AND4_C(3, 2, F4O4);
-      SHL32_4(4, 3, 16);
-      OR4(5, 4, 2);    
-#endif
-    
-      AND4_C(6, 5, O4F4);
-      AND4_C(7, 6, F2O2);
-      SHL32_4(8, 7, 8);
-      OR4(9, 8, 6);
-      
-      AND4_C(10, 9, O2F2);
-      AND4_C(11, 10, F1O1);
-      SHL32_4(12, 11, 4);
-      OR4(13, 12, 10);
-      
-      AND4_C(14, 13, O1F1);
-      AND4_C(15, 14, E2N2);
-      SHL32_4(16, 15, 2);
-      OR4(17, 16, 14);
-     
-      AND4_C(18, 17, N2E2);
-      AND4_C(19, 18, E1N1);
-      SHL32_4(20, 19, 1);
-      OR4(21, 20, 18);
-            
-      AND4_C(22, 21, N1E1);
-      const BlockType c23 = SHL32(c22, 1);
-      const BlockType d23 = SHL32(d22, 1);
-      const BlockType a23 = OR(a22, c23);
-      const BlockType b23 = OR(b22, d23);
-       
-      *((BlockType0*) New) = a23;
-      //      *((BlockType0*) (New + UnitsPerBlock)) = c23; // OK
-     *((BlockType0*) (New + UnitsPerBlock)) = b23; // OK
-
-      Old += UnitsPerBlock;// OK
-      halfmatrix += UnitsPerBlock;// OK
-      New += 2 * UnitsPerBlock;// OK
-    } // for i
-  } // for j
-}
-
-SCALAR012(128)	
-
-
-
-
-#else // !defined AVX
-#include "Template.h"
-#include "1bit.h"
-#include "avx_miss.h"
-
-
-  DeclareVersion1(128,Su,Sv)
-  SIMD_MISS(1bit128, ssse3);
-
-#endif
diff --git a/src/miraculix/2bitUint.cc b/src/miraculix/2bitUint.cc
index 5d9b6d8d..029ed993 100755
--- a/src/miraculix/2bitUint.cc
+++ b/src/miraculix/2bitUint.cc
@@ -305,7 +305,7 @@ if (gV) vD = variant < 256 ? genoVector2_double : genoVector2v256_double;
 sparseTGeno_header(2Bit) {
   BUG;
   // unsinns aufruf!!:
-   sparseTGenoOrigPlink(code, nrow_code, ncol_code,
+   sparseTGenoPlink(code, nrow_code, ncol_code,
 		    ldaInByte, coding,
 		    valueB, nIdx, rowIdxB, colIdxB, tSparse,
 		    global, utils, 
diff --git a/src/miraculix/3bitUint.cc b/src/miraculix/3bitUint.cc
index 451f0618..444965da 100755
--- a/src/miraculix/3bitUint.cc
+++ b/src/miraculix/3bitUint.cc
@@ -56,8 +56,8 @@ void initiate_tableI(table_type **table, int tableSize,
 
   int d;
   int *nx = (int *) CALLOC(codesPerPartUnit, sizeof(int));
-  
   *table = (table_type*) CALLOC(tableSize, sizeof(table_type));
+  
   table_type* pt = *table;
 
   while (true) {
diff --git a/src/miraculix/5codes512.cc b/src/miraculix/5codes512.cc
index c2d62eed..2d4f9f10 100755
--- a/src/miraculix/5codes512.cc
+++ b/src/miraculix/5codes512.cc
@@ -32,10 +32,10 @@
 #include "5codes.h"
 
 
-#if defined AVX2    
+#if defined AVX512
 
 
-ASSERT_SIMD(5codes512, avx2);
+ASSERT_SIMD(5codes512, avx512f);
 
 #include "MX.h"
 #include "haplogeno.h" 
@@ -47,13 +47,13 @@ ASSERT_SIMD(5codes512, avx2);
 
 #else
 #include "avx_miss.h"
-  
+
    
 gV5_header(512, floatD, LongDouble, float, double) Sv
 gV5_header(512, double, LongDouble, double, double) Sv
 
 
-SIMD_MISS(5codes512, avx2);
+SIMD_MISS(5codes512, avx512f);
 
 #endif  // defined AVX
 // trafo2Geno1Geno128
diff --git a/src/miraculix/5codesAPI.c b/src/miraculix/5codesAPI.c
index 6216583a..bf7d4729 100755
--- a/src/miraculix/5codesAPI.c
+++ b/src/miraculix/5codesAPI.c
@@ -86,7 +86,7 @@ void plink2compressed(char *plink,
 
   // setOptions_compressed(0, 6, 0, 0, 0, 0, 0, 0, 1);
 
-   int nbytes = indiv /4;   printf("p2c snps=%d indiv=%d max_n=%d bytes/indiv=%d\n", snps, indiv, max_n, nbytes);
+   int nbytes = indiv /4;   //printf("p2c snps=%d indiv=%d max_n=%d bytes/indiv=%d\n", snps, indiv, max_n, nbytes);
   // exit(99);
  //  printf("c byte 1:5 = %d %d %d %d %d\n", *plink, *(plink+1), *(plink+2), *(plink+3), *(plink+4));
 
@@ -147,10 +147,10 @@ void sparse_times_plink(char *transsparse,// N: matrix as is (sparse row format)
 
   if (is(transcompressed)) {
     int tmp = snps; snps=indiv; indiv=tmp;
-    plink_transposed = plink;
+    plink = plink_transposed;
   }
   
-  sparseTGenoPlinkApi(plink_transposed, snps, indiv,
+  sparseTGenoPlinkApi(plink, indiv, snps, 
 		      B, nIdx, rowIdxB, colIdxB, is(transsparse), C, Ldc);
 
   return;
diff --git a/src/miraculix/5codesChar.cc~ b/src/miraculix/5codesChar.cc~
deleted file mode 100755
index 88707e70..00000000
--- a/src/miraculix/5codesChar.cc~
+++ /dev/null
@@ -1,434 +0,0 @@
-
-/*
- Authors 
- Martin Schlather, martin.schlather@uni-mannheim.de
-
- Copyright (C) 2022-2023 Martin Schlather
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-#include "Basic_miraculix.h"
-#include "xport_import.h"
-#include "options.h"
-#include "haplogeno.h"
-#include "MXinfo.h"
-#include "Template.h"
-#include "5codes.h"
-#include "plink.h"
-#include "Vector.matrix.h"
-#include "GPUapi.h"
-#include "intrinsics_specific.h"
-// #include "compa
-
-
-#define MY_VARIANT 8
-
-
-const Long CpByte = GetCodesPerUnit(FiveCodes) / BytesPerUnit;  // OK
-const Long CpByteplink = GetCodesPerUnit(Plink) / BytesPerUnit; // OK
-const Long BitsPerCodeplink = BitsPerByte / CpByteplink;
-extern Uchar PLINK2FIVE[1024];
-extern Uint ALLELESUM[243];
- 
-
-
-#if !defined CUDA
-void plink2gpu(char VARIABLE_IS_NOT_USED  *plink, // including three-byte
-	       // header, size in bytes: ceiling(indiv/4) * snps + 3
-    char VARIABLE_IS_NOT_USED *plink_transposed, // @JV: If I remember correctly, you need to
-                            // transpose the matrix anyway? so it should be
-                            // easier to have this as an argument, and not
-                            // transpose it again on the GPU
-    int VARIABLE_IS_NOT_USED snps,
-    int VARIABLE_IS_NOT_USED indiv, // actual number of individuals, not N/4 rounded up
-	       double VARIABLE_IS_NOT_USED *f,  // vector guaranteed to be of length  dsnps
-    int VARIABLE_IS_NOT_USED n, 
-    void VARIABLE_IS_NOT_USED **GPU_obj) {
-  BUG;
-    }
-void dgemm_compressed_gpu(
-    bool VARIABLE_IS_NOT_USED trans, //
-    void VARIABLE_IS_NOT_USED *GPU_obj,
-    //    double *f,
-    int VARIABLE_IS_NOT_USED n,     // number of columns of matrix B
-    double VARIABLE_IS_NOT_USED *B,  // matrix of dimensions (nsnps, n) if trans =
-                // "N" and (nindiv,n) if trans = "T"
-    int VARIABLE_IS_NOT_USED ldb,
-    int VARIABLE_IS_NOT_USED centered,
-    int VARIABLE_IS_NOT_USED normalized,
-    double VARIABLE_IS_NOT_USED *C,
-    int VARIABLE_IS_NOT_USED ldc
-			  ) {
-  BUG;
-}
-void freegpu(void VARIABLE_IS_NOT_USED **GPU_obj){
-  BUG;
-}
-#endif
-
-
-void print_options(option_type *global, utilsoption_type *utils) {
-   if (utils->basic.Cprintlevel) {
-     const char *ft [] ={"false", "true", "tba", "tba"};
-     int G = (int) global->tuning.gpu << 1;
-     PRINTF("\n");     
-     PRINTF("gpu      : %s\n", ft[global->tuning.gpu]);
-     if (!global->tuning.gpu) {
-       if ( global->genetics.centered != NoCentering)
-	 PRINTF("ignore missings: %s\n", ft[G + global->tuning.missingsFully0]);
-       PRINTF("do center: %s\n", ft[G + (global->genetics.centered != NoCentering)]);
-       if (global->tuning.floatLoop >=0)
-	 PRINTF("float    : %s\n", ft[G + global->tuning.floatLoop > 0]);
-       else PRINTF("precise  : true\n");
-       PRINTF("meanV    : %s\n", ft[G + global->tuning.meanVsubstract]);
-       PRINTF("meanSxI  : %s\n", ft[G + global->tuning.meanSxIsubstract]);
-       PRINTF("normalize: %s\n", ft[G + global->genetics.normalized] );
-        PRINTF("variant  : %d \n",   global->tuning.variant);
-     }
-     PRINTF("use fortran freq: %s\n", 
-	    ft[G + global->genetics.prefer_external_freq] );
-     PRINTF("cores    : %d \n",  utils->basic.cores );
-     PRINTF("print lvl: %d \n",  utils->basic.Cprintlevel );
-   }
-}
-
-
-
-void get_started(option_type **g, utilsoption_type **u) {
-  static option_type *global = NULL;
-  static utilsoption_type *utils = NULL;
-  if (global == NULL) {
-    startMiraculix(1);
-    WhichOptionList(false, &global, &utils);
-    
-    //    printf(">>>>>>>>> global %ld\n", (Long) global);
-    
-    setOptions5(
-#if defined CUDA
-		true,
-#else
-		false,
-#endif
-		0, // cores 
-		false, // floatLoop
-		false, // meanV
-		false, // meanSxI
-		false, // irgnore missings
-		RowMeans, // centering
-		NoNormalizing, // normalize
-		true, // use Fortran freq
-		32,
-		false // print details
-		);
-    print_options(global, utils);
-    Ext_startRFU();
-    //   printf(" >> "); //, (Long) (&OPTIONS), (Long) utils;
-  }
-  *g = global;
-  *u = utils;
- assert(global != NULL);
-  assert(utils != NULL);
-  //  printf("getstarted_done %ld\n", (Long) global);
-  if (utils->basic.Cprintlevel > 2) {
-    PRINTF("<<<<< global %ld\n", (Long) global);
-    print_options(global, utils);
-  }
-}
-
-
-
-void setOptions5(int gpu, int cores, int floatLoop,
-		 int meanVsubstract, int meanSxIsubstract,
-		 int missingsFully0,
-		 int centered, int normalized,
-		 int prefer_external_freq,
-		 int variant, int print_details) {
-  if (print_details > 10) PRINTF("entering setoptions cores = %d\n", cores);
-  option_type *global;
-  utilsoption_type *utils;   
-  get_started(&global, &utils);
-  if (print_details > 10) PRINTF("B %ld %ld cores = %d\n", (Long) global, (Long) utils, cores);
-  tuning_options *tuning = &(global->tuning);
-  genetics_options *genetics = &(global->genetics);
-  if (cores == 0) {
-    const char* s = getenv("OMP_NUM_THREADS");
-    if (s != NULL) cores = (int) strtoimax(s, NULL, 10);
-    if (cores <= 0) {
-      PRINTF("WARNING: seeing 'OMP_NUM_THREADS=%s' only -- #threads set to 4\n", s);
-       cores = 4;
-       //PRINTF("number of threads not positive");  BUG;
-    }
-  }
-  if (print_details > 10) PRINTF("cores = %d\n", cores);
-  utils->basic.cores = cores;
-  tuning->gpu = (bool) gpu;
-   if (gpu && 
-       (!prefer_external_freq || missingsFully0 || centered || normalized))
-    ERR("in case of 'gpu' the Fortran frequency must always be used; missings/centering/normalizing cannot treated.");
-  tuning->floatLoop = floatLoop;
-  tuning->meanVsubstract = (bool) meanVsubstract;
-  tuning->meanSxIsubstract = (bool) meanSxIsubstract;
-  tuning->missingsFully0 = (bool) missingsFully0;
-  genetics->centered = (centering_type) centered;
-  genetics->prefer_external_freq = prefer_external_freq;
-  genetics->normalized = (normalizing_type) normalized;
-  tuning->variant = variant < 32 ? 32 : variant > 256 ? 256 : variant;
-  utils->basic.efficient = false;
-  utils->basic.Cprintlevel = print_details * 5;
-  tuning->addtransposed = true;
-  if (print_details > 10) {
-    print_options(global, utils);
-    PRINTF("setoptions5 done \n");
-  }
-}
-
-
-#define TBM 0x03
-void plink2Geno5codestrans32(Uchar *M, Long snps, Long individuals,
-			     int VARIABLE_IS_NOT_USED cores,
-			     unit_t *Ans, Long ldAns) { 
-  // printf("p25T enter\n");
-  if (sizeof(Uchar) * BitsPerByte != MY_VARIANT) BUG;
-  Uchar *table = PLINK2FIVE;
-  
-  Long
-    ldM = 1L + (individuals - 1L) / CpByteplink, // in bytes!!!
-    end_snpsM1_CpB = (snps - 1L) / CpByte;
-  
-#ifdef DO_PARALLEL
-#pragma omp parallel for num_threads(cores) schedule(static)
-#endif
-  for (Long i=0; i<=end_snpsM1_CpB; i++) {
-    Long rest = CpByte;
-    Uchar *pM =  M + ldM * CpByte * i;
-    Uchar *a0 = ((Uchar*) (Ans + ldAns * i )); // !!
-    Uchar h[BitsPerByte]; // CpByte <= BitsPerByte !
-    
-    if (i < end_snpsM1_CpB) {} else {
-      rest = snps - end_snpsM1_CpB * rest;		
-      assert(rest > 0);
-      for (Long ll=rest; ll<CpByte; ll++) h[ll] = 0;
-    }
-    
-    for (Long j=0; j<ldM; j++, pM++) {
-      Uchar *a = a0 + j * CpByteplink;
-      for (Long z=0; z<rest; z++) h[z] = pM[z * ldM];
-      for (Long k=0; k<CpByteplink; k++) {
-	const Uint m = ((Uint) (h[0] & TBM)) |
-	  (((Uint) (h[1] & TBM)) << 2) |
-	  (((Uint) (h[2] & TBM)) << 4) |
-	  (((Uint) (h[3] & TBM)) << 6) |
-	  (((Uint) (h[4] & TBM)) << 8);
-	a[k] = table[m];
-	for (Long ll=0; ll<CpByte; ll++) h[ll] >>= BitsPerCodeplink;
-      }
-    }
-  }
-  //  printf("p25T done\n");
-}
-
-
-#if defined LONG_HANGING_TYPE
-#undef LONG_HANGING_TYPE
-#endif
-//#define LONG_HANGING_TYPE Long
-//#define HTT_BITS 64
-#define LONG_HANGING_TYPE Long
-#define HTT_BITS 64
-#define ORIGBITS_TRANS_MASK 0x00000000000003FF
-// irgendwo fehler drin -- sollte eigentlich bisschen schneller gehen
-void plink2Geno5codes32(Uchar *M, Long snps, Long individuals,
-			int VARIABLE_IS_NOT_USED cores,
-			unit_t *Ans, Long ldAns) {
-  //  printf("p25 enter\n");
-  Uchar *table = PLINK2FIVE;
-  Long ldM = 1L + (individuals - 1L) / CpByteplink; // in bytes!!!
-  Long end_indiM1_CpB = (individuals -1L) / CpByte;
-  Long ldaByte = ldAns * BytesPerUnit;
-  Long totalBytes = ldM * snps;
-
-  if (MY_LDABITALIGN_2BIT < sizeof(Long)) BUG;
-  if (HTT_BITS != sizeof(LONG_HANGING_TYPE) * BitsPerByte) BUG;
-  
-#ifdef DO_PARALLEL
-#pragma omp parallel for num_threads(cores) schedule(static)
-#endif
-  for (Long i=0; i<snps; i++) {
-    //    printf("i=%ld %d\n", i, snps);
-    int availableBits = 0;
-    LONG_HANGING_TYPE hanging = 0,
-      m = 0;
-    int hangingBits = 0;
-    Uchar *a0 = ((Uchar*) Ans) + i; // !!
-    Long byteNr = i * ldM;
-    LONG_HANGING_TYPE *pM = (LONG_HANGING_TYPE*) (M + byteNr);
-    for(Long j=0; j<=end_indiM1_CpB; j++) {      
-      //  printf("i=%ld %d j=%ld %d ldM=%d %d\n", i, snps, j,end_indiM1_CpB, ldM, individuals);
-      // i=127 128 j=224 225 ldM=282 1128
-      Uchar *a = a0 + ldaByte * j;					
-      while (availableBits < ORIGBITSperFIVE) {
-	if (hangingBits == 0) {
-	  byteNr += sizeof(LONG_HANGING_TYPE); // endpoint after loading
-	  // printf("hanging %d <= %d \n", byteNr,totalBytes );
-	  if (byteNr <= totalBytes) {
-	    // printf(".");
-	    // inbetween just (non-annoying) nonsense is additionally loaded
-	    // if ldM is passed by *pM
-	    hanging = *(pM++);
-	    hangingBits = HTT_BITS;
-	  } else {
-	    // at the very end, instead of nonsense loading, we would have
-	    // a segmenation fault
-	    byteNr -= sizeof(LONG_HANGING_TYPE);
-	    int left =  (int) (totalBytes - byteNr);
-	    for (int k=0; k < left; k++) {
-	      //     printf("k=%d left = %d %d\n", k, left, (int) (((Uchar*) pM)[k]));
-	      //   printf("k=%d left = %d %d\n", k, left, (int) ( ((Uchar*) (&hanging))[k]));
-	      ((Uchar*) (&hanging))[k] = ((Uchar*) pM)[k];
-	    }
-	    hangingBits = left * BitsPerByte;
-	  }
-	}
-	//	printf("%d < %d\n", availableBits ,  ORIGBITSperFIVE);
-	m |= hanging << availableBits;
-	int movedBits = hangingBits <= HTT_BITS-availableBits
-	  ? hangingBits : HTT_BITS - availableBits;
-	//	printf("i=%ld j=%ld %d %ld %d  %d<%d-%d\n", i, j, hangingBits, (Long) M - (Long) pM, movedBits, hangingBits, sizeof(LONG_HANGING_TYPE), availableBits);
-	availableBits += movedBits;
-	hangingBits -= movedBits;
-	hanging >>= movedBits;
-      }
-      *a = table[m & ORIGBITS_TRANS_MASK];
-      m >>= ORIGBITSperFIVE;
-      availableBits -= ORIGBITSperFIVE;
-    }
-  }	
-  //  printf("p25 done\n");
-}
-
-		   
-void plinkTo5(char *plink, char *plink_transposed, 
-	      Long snps, Long indiv,
-	      double *f, 
-	      int max_n,
-	      void**compressed) {
-  //   printf("pto 5 do\n");
- option_type *global;
-  utilsoption_type *utils;   
-  get_started(&global, &utils);
-  stopIfNotInt(snps | indiv);
-  if (global->tuning.gpu) {
-    plink2gpu((char*) plink, (char*) plink_transposed,
-	      (int) snps, (int) indiv, f, max_n,
-	      compressed);
-    return;
-  } 
-
-  basic_options *opt = &(utils->basic);
-  SEXP SxI = PROTECT(CreateEmptyCodeVector(snps, indiv, FiveCodes,
-					   global->tuning.variant, 0, false,
-					   global, utils));  
-  
-  *compressed = (void*) SxI;
-  int cores = GreaterZero(opt->cores);
-  unit_t *code = Align(SxI, opt);
-  Long *info = GetInfo(SxI);
-  Long ldAns = info[LDA];
- 
-  if (global->tuning.missingsFully0) getPlinkMissings((Uchar*) plink, SxI, opt);
-  //  printf("coding ...\n");
-  plink2Geno5codes32((Uchar*) plink, snps, indiv, cores, code, ldAns);
-  
-  SEXP next = getAttribPointer(SxI, Next);
-  unit_t *codenext = Align(next, opt);
-  Long *infonext = GetInfo(next),
-    ldanext = infonext[LDA];
-  
-  //printf("trans coding ...\n");
-  plink2Geno5codestrans32((Uchar*)plink, snps, indiv, cores, codenext, ldanext);
-  if (global->genetics.prefer_external_freq) {
-    getFreq(SxI, f, global, utils);
-  }
-  // printf("plinkTo5 done\n");
-}
-
-
-void vectorGeno5api(void *compressed, double *V, Long repetV, Long LdV, 
-		    double *ans, Long LdAns) {
-  option_type *global;
-  utilsoption_type *utils;
-  get_started(&global, &utils);
-  if (global->tuning.gpu) {
-    //
-   stopIfNotInt(repetV | LdV | LdAns);
-   dgemm_compressed_gpu(false, compressed, (int) repetV, V, (int) LdV,
-			 global->genetics.centered, 
-			 global->genetics.normalized, 
-			ans, (int) LdAns);
- } else {
-    vectorGeno_means_double((SEXP)compressed, V, repetV, LdV, global, utils,
-			    ans, LdAns);
-  }
-}
-
-
-void genoVector5api(void *compressed, double *V, Long repetV, Long LdV, 
-		 double *ans, Long LdAns) {
-  option_type *global;
-  utilsoption_type *utils;
-  get_started(&global, &utils);
-  if (global->tuning.gpu) {
-    stopIfNotInt(repetV | LdV | LdAns)
-      dgemm_compressed_gpu(true, compressed, (int) repetV, V, (int) LdV, 
-			 global->genetics.centered, 
-			 global->genetics.normalized,
-			   ans, (int) LdAns);
-  } else {
-    genoVector_means_double((SEXP)compressed, V, repetV, LdV, global, utils,
-			    ans, LdAns);
-  }  
-} 
-
-
-
-void free5(void **compressed) {
-  option_type *global;
-  utilsoption_type *utils;   
-  get_started(&global, &utils);
-  if (global->tuning.gpu) {
-    freegpu(compressed);
-  } else {
-    FREE_SEXP((SEXP*) compressed); // will crash?!!!
-  }
-}
-
-
-void getFreq5(void *compressed,  double *f) {
-  SEXP SxI = (SEXP) compressed;
-  //   printINFO(SxI);
-  option_type *global;
-  utilsoption_type *utils;
-  get_started(&global, &utils);
-  Long *info = GetInfo(SxI);
-  Long
-    snps = info[SNPS],
-    rows = snps;
-  //  Long individuals = info[INDIVIDUALS];
-  getFreq(SxI, global, utils);
-  LongDouble *freq = getFreq(SxI);
-   for (Long j=0; j<rows; j++) {
-    f[j] = (double) freq[j];
-    //   printf("j=%d %f\n", j, f[j]);
-  }
-}
diff --git a/src/miraculix/5codesIntern.h b/src/miraculix/5codesIntern.h
index 01708ea1..8c20f795 100755
--- a/src/miraculix/5codesIntern.h
+++ b/src/miraculix/5codesIntern.h
@@ -52,6 +52,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
 //#define AlteVersion 1
 #define AlteVersion 0
 
+#define coreFactor 5 // 0.25 // oder 0.2
   
 #define gV5_start0(NR, TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)	\
   int cores = GreaterZero(opt->cores); /* NOT Long !! */      		\
@@ -59,7 +60,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
     char msg[200];							\
     SPRINTF(msg, "%s Bit %s-%s required for %ld x %ld x %ld", 		\
 	    #NR, #TRDTYPE, #ENDTYPE, rows, cols, repetV);		\
-    PRINTF("%s", msg); for (int i=(int) strlen(msg); i<57; i++) PRINTF(" "); \
+    PRINTF("%s", msg); for (int i=(int) STRLEN(msg); i<57; i++) PRINTF(" "); \
     PRINTF(" -> %3ld Bit, %2d cores\n",					\
 	   sizeof(AVXTYPE) * BitsPerByte, cores);			\
   }									\
@@ -68,9 +69,10 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
   const Long colsCpB = DIV_GEQ(cols, CpB);				\
   const Long colsCpBm1 = colsCpB - 1;					\
   const Long colBlocks = DIV_GEQ(colsCpB,  AtOnce);			\
-  Long blockSliceLen = DIV_GEQ(colBlocks, cores * 5);			\
+  Long blockSliceLen = DIV_GEQ(colBlocks, cores * coreFactor);		\
+  if (false) printf("blockSliceLen=%ld\n", blockSliceLen);		\
   blockSliceLen = MAX(1, MIN(SLICELEN, blockSliceLen));			\
-  Long blocks = DIV_GEQ(colBlocks, blockSliceLen); /* (cores * 5) repetV */; \
+  Long blocks = DIV_GEQ(colBlocks, blockSliceLen); /* (ca. cores * 5) repetV */; \
   /* for large repetV, use repetV only for splitting */			\
   /* blocks = blocks > cores ? cores : blocks > 1 ? blocks : 1;	*/	\
   const Long blocksM1 = blocks - 1L;					\
@@ -100,22 +102,22 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
 
 
 
-#if defined AVX2
+#if defined AVX2 || defined AVX512F
 #define gV5_start(NR, TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)	\
   gV5_header(NR, TYPE, SNDTYPE, TRDTYPE, ENDTYPE) {		\
   const Long VatOnce = (sizeof(AVXTYPE) / sizeof(TRDTYPE));	\
-  if (VatOnce == 4) {						\
-    const Long restV = repetV % VatOnce;			\
-    if (restV > 0 && restV <= 2) {				\
-      vector_##TYPE##_t vD = restV == 1 ? genoVector5v32_##TYPE	\
-	: genoVector5v128_##TYPE;				\
-      vD(code, rows, cols, lda, coding, variant, pseudoFreq,	\
- 	 V, restV, ldV, opt, tuning, Ans, ldAns);		\
-      repetV -= restV;						\
-      V += restV * ldV;						\
-      Ans += restV * ldAns;					\
-    }								\
-  }								\
+  const Long restV = repetV % VatOnce;				\
+  if (restV > 0 &&						\
+      ((VatOnce == 8 && restV <= 4) || (VatOnce == 4 && restV <= 2))) {	\
+    vector_##TYPE##_t vD = restV == 1 ? genoVector5v32_##TYPE		\
+      : restV == 2 ? genoVector5v128_##TYPE				\
+      : genoVector5v256_##TYPE;						\
+    vD(code, rows, cols, lda, coding, variant, pseudoFreq,		\
+       V, restV, ldV, opt, tuning, Ans, ldAns);				\
+    repetV -= restV;							\
+    V += restV * ldV;							\
+    Ans += restV * ldAns;						\
+  }									\
   gV5_start0(NR, TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)
 #else
 #define gV5_start(NR, TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)	\
@@ -125,7 +127,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
 #endif
 
 
-#define gV5_LoopOne(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
+#define gV5_CreateHash(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)	\
   for (Long rr=0; rr < NVblocks; rr++) {				\
     AVXTYPE *f = F + rr * HashSizePerV;					\
     Long sEnd = MIN(VatOnce, repetV - rr * VatOnce);			\
@@ -133,7 +135,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
       TYPE *v_i = V + rr * VatOnce * ldV + i * CpB;			\
       AVXTYPE *hash0 = f + i * HashSize;				\
       int hastRest = CpB;						\
-      TYPE x[CpB] = {0, 0, 0, 0, 0};/* hier compiler fehler bei collapse!?! */ \
+      SNDTYPE x[CpB] = {0, 0, 0, 0, 0};/* hier compiler fehler bei collapse!?! */ \
       if (i < colsCpBm1) {} else {					\
 	hastRest = (int) (cols - colsCpBm1 * hastRest);			\
 	assert(hastRest > 0);						\
@@ -143,32 +145,35 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
       for (Long s=0; s<sEnd; s++) {					\
 	TYPE *v = v_i + ldV * s;					\
 	TRDTYPE *hash = ((TRDTYPE*) hash0) + s;				\
-	for (int k=0; k<hastRest; k++) {x[k] = v[k]; }			\
+	for (int k=0; k<hastRest; k++) {x[k] = (SNDTYPE) v[k]; }	\
 	if (pseudoFreq != NULL) {					\
 	  SNDTYPE meanLD = 0.0;						\
 	  for (int k=0; k<hastRest; k++)				\
-	    meanLD += (SNDTYPE) pseudoFreq[i * CpB + k] * (SNDTYPE)(2 * x[k]); \
+	    meanLD += (SNDTYPE) pseudoFreq[i * CpB + k] * (2 * x[k]); \
 	  mean = meanLD;						\
 	}								\
 	if (externalV != NULL) {					\
 	} else {							\
 	  for (Uint i4=0; i4<3; i4++) {					\
 	    Uint V4 = i4;						\
-	    SNDTYPE fct4 = (SNDTYPE) i4 * (SNDTYPE) x[4];		\
+	    SNDTYPE fct4 = (SNDTYPE) i4 * x[4] - mean;			\
 	    for (Uint i3=0; i3<3; i3++) {				\
 	      Uint V3 = 3 * V4 + i3;					\
-	      SNDTYPE fct3 = fct4 + i3 * x[3];				\
+	      SNDTYPE fct3 = fct4 + (SNDTYPE) i3 * x[3];	\
 	      for (Uint i2=0; i2<3; i2++) {				\
-		Uint V2 = 3 * V3 + i2;					\
-		SNDTYPE fct2 = (SNDTYPE)fct3 + (SNDTYPE) i2 * (SNDTYPE)x[2]; \
-		for (Uint i1=0; i1<3; i1++) {				\
-		  Uint V1 = 3 * V2 + i1;				\
-		  SNDTYPE fct1 = (SNDTYPE)fct2 + (SNDTYPE)i1 * (SNDTYPE)x[1]; \
-		  for (Uint i0=0; i0<3; i0++) {				\
-		    Uint V0 = 3 * V1 + i0;				\
-		    SNDTYPE fct0 =(SNDTYPE)fct1 + (SNDTYPE)i0 *(SNDTYPE)x[0]; \
-		    hash[V0 * VatOnce] = (TRDTYPE) (fct0 - mean); \
-		  }							\
+                Uint V2 = 3 * V3 + i2;					\
+		SNDTYPE fct2 = fct3 + (SNDTYPE) i2 * x[2];	\
+	   	for (Uint i1=0; i1<3; i1++) {				\
+		  Uint V1 = 3 * V2 + i1;      				\
+		  SNDTYPE fct1 = fct2 + (SNDTYPE) i1 * x[1];	\
+		  							\
+		  Uint V0 = 3 * V1 * VatOnce;				\
+		  SNDTYPE fct0 = fct1;			\
+		  hash[V0] = (TRDTYPE) fct0;			\
+		  V0+=VatOnce;						\
+		  fct0 += x[0];						\
+		  hash[V0] = (TRDTYPE) fct0;			\
+		  hash[V0 + VatOnce] = (TRDTYPE) (fct0 + x[0]);	\
 		}							\
 	      }								\
 	    }								\
@@ -176,7 +181,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
 	}								\
       }									\
     }									\
-    }									\
+  }									\
   if (false) PRINTF("Hash end\n");					\
   const Long rowBlocks = MAX(1, DIV_LEQ(rows, RoughRowChunk));		\
   const Long RowChunk =  DIV_GEQ(rows, rowBlocks);			\
@@ -184,35 +189,65 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
   for (Long bStart=0; bStart<rows; bStart+=RowChunk) {			\
     Long bEnd = MIN(bStart + RowChunk, rows);				\
 
+/* 
+Aufwand fuer Erstellung der Hashtabelle kann deutlich reduziert wreden,
+wenn SNDTYPE==TRDTYPE
+(i)  i = 0
+     AVX-Vektor X0 mit Werten x[0] x VatOnce; 
+(ii) 3 AVX-Vektoren: (a) 0  (b) Xi (c) 2 * Xi
+(iii) i++
+(iv) AVX-Vektor Xi mit Werten x[i] x VatOnce erstellen
+(v)
+     (a) alle Vektoren kopieren auf die direkt nachfolgenden Speicherplaetze
+     (b) das Kopierte plus Xi
+     (c) Ergebnis kopieren auf die direkt nachfolgenden Speicherplaetze
+     (c) das Kopierte plus Xi
+(vi) weiter mit (iii) bis i=4 erreicht
+
+Von V lokal eine Kopie halten? 
+V umsortieren, so dass in Reihenfolge ausgelesen wird?
+*/
+  
+
+#define ccc 0
 
-#define gV5_Loop2A(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
-  for (Long C=0; C <= blocksM1; C++) {				\
-    for (Long rr=0; rr < NVblocks; rr++) {				\
-      AVXTYPE *ff = F + rr * HashSizePerV + sliceLen * C * HashSize; \
+#define gV5_LoopA(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
+  for (Long C=0; C <= blocksM1; C++) {					\
+    for (Long rr=0; rr < NVblocks; rr++) { /* all repetV, as size is small*/ \
+      AVXTYPE *ff = F + rr * HashSizePerV + sliceLen * C * HashSize;	\
       AVXTYPE *t = Tmp + rr * blocksP1Xrows + rows * ( C);		\
       unit_t *c = code + lda * sliceLen * C;				\
       Long nrCols = C == blocksM1 ? rest : sliceLen;			\
       /* assert(nrCols > 0); */						\
-      for (Long i = 0 ; i<nrCols; i+=AtOnce) {				\
+      for (Long i = 0 ; i<nrCols; i+=AtOnce) {/* cols of compressed */	\
         Uchar *pC0 = (Uchar*) (c + i * lda);				\
-        AVXTYPE ff0[HashSize];						\
-	MEMCOPY(ff0, ff + (i + 0L) * HashSize, Hash_BytesPcol);		\
+	AVXTYPE ff0[HashSize];						\
+	if (true) MEMCOPY(ff0, ff + (i + 0L) * HashSize, Hash_BytesPcol); \
+	else MEMSET(ff0, ccc, Hash_BytesPcol);				\
 	Uchar *pC1 = (Uchar*) (c + (i+1L) * lda);			\
 	AVXTYPE ff1[HashSize];						\
-	MEMCOPY(ff1, ff + (i+1L) * HashSize, Hash_BytesPcol);		\
+	if (true) MEMCOPY(ff1, ff + (i+1L) * HashSize, Hash_BytesPcol); \
+	else MEMSET(ff1, ccc, Hash_BytesPcol);				\
 	Uchar *pC2 = (Uchar*) (c + (i+2L) * lda);			\
 	AVXTYPE ff2[HashSize];						\
-	MEMCOPY(ff2, ff + (i+2L) * HashSize, Hash_BytesPcol);		\
+	if (true) MEMCOPY(ff2, ff + (i+2L) * HashSize, Hash_BytesPcol); \
+	  else MEMSET(ff2, ccc, Hash_BytesPcol);			\
 	Uchar *pC3 = (Uchar*) (c + (i+3L) * lda);			\
 	AVXTYPE ff3[HashSize];						\
-	MEMCOPY(ff3, ff + (i+3L) * HashSize, Hash_BytesPcol);		\
-	for (Long b=bStart; b<bEnd; b++) {				\
-
+	if (true) MEMCOPY(ff3, ff + (i+3L) * HashSize, Hash_BytesPcol); \
+	else MEMSET(ff3, ccc, Hash_BytesPcol);				\
+	for (Long b=bStart; b<bEnd; b++) { /* rows of compressed */	\
+  
+/*
+  double bbb = i;							
+  if (false) for (Long jjj=0; jjj<1000; jjj++) bbb += EXP((double) i * jjj); 
+  char ccc = bbb > 0;							
+*/
 
 
 #if AtOnce == 4
 #if defined AVX2
-#define gV5_Loop2B(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
+#define gV5_LoopB(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
     const AVXTYPE fC0 = AVXLOAD((TRDTYPE*)(ff0 + (int) pC0[b]));	\
     const AVXTYPE fC1 = AVXLOAD((TRDTYPE*)(ff1 + (int) pC1[b]));	\
     const AVXTYPE fC2 = AVXLOAD((TRDTYPE*)(ff2 + (int) pC2[b]));	\
@@ -224,7 +259,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
     AVXSTORE((TRDTYPE*)(t + b),  AVXADD( tb,  h));	\
   }
 #else
-#define gV5_Loop2B(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)	\
+#define gV5_LoopB(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)	\
     t[b] +=  (ff0[(int) pC0[b]] + ff1[(int) pC1[b]])		\
       + (ff2[(int) pC2[b]] + ff3[(int) pC3[b]]);		\
   }
@@ -232,7 +267,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
 
 #elif AtOnce == 3
 #if defined AVX2
-#define gV5_Loop2B(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
+#define gV5_LoopB(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
    const AVXTYPE fC0 = AVXLOAD((TRDTYPE*)(ff0 + (int) pC0[b]));	\
     const AVXTYPE fC1 = AVXLOAD((TRDTYPE*)(ff1 + (int) pC1[b]));	\
     const AVXTYPE fC2 = AVXLOAD((TRDTYPE*)(ff2 + (int) pC2[b]));	\
@@ -243,7 +278,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
     AVXSTORE((TRDTYPE*)(t + b), tb);					\
   }
 #else
-#define gV5_Loop2B(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
+#define gV5_LoopB(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
     t[b] += ff0[(int) pC0[b]] + ff1[(int) pC1[b]] + ff2[(int) pC2[b]];	\
   }
 #endif
@@ -273,9 +308,9 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
 #endif // AtOnce
     
     
-#define gV5_LoopTwo(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
-  gV5_Loop2A(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)			\
-    gV5_Loop2B(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
+#define gV5_MainLoop(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
+  gV5_LoopA(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)			\
+    gV5_LoopB(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
     }}									\
   }}									\
    if (false) PRINTF("calc end blocks=%ld\n", blocks);			\
@@ -283,7 +318,7 @@ Wageningen/run_gcc snps=150000 indiv=150000 repetV=32 cores=10 floatLoop=0 meanS
 
 #if AlteVersion == 0
 
-#define gV5_LoopSum(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
+#define gV5_SumUp(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
   for (Long rr=0; rr < NVblocks; rr++) {				\
     AVXTYPE *tmp = Tmp + rr * blocksP1Xrows;				\
     Long level = rows;							\
@@ -308,7 +343,7 @@ if (false) PRINTF("end sum up\n");
 
 #else
 
-#define gV5_LoopSum(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
+#define gV5_SumUp(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
   for (Long rr=0; rr < NVblocks; rr++) {				\
     Long rowsVatonce = rows * VatOnce;					\
     TRDTYPE *s = (TRDTYPE*) (sum + rr * blocksP1Xrows);			\
@@ -323,17 +358,17 @@ if (false) PRINTF("end sum up\n");
 	s[j] += (TRDTYPE) ((ENDTYPE) t0[j] + (ENDTYPE) t1[j] +		\
 			   (ENDTYPE) t2[j] + (ENDTYPE) t3[j]);		\
       }									\
-     }									\
+    }									\
   }									\
   if (false) PRINTF("!!! Old version of summing up (%ld, %ld)!!!\n", sizeof(TRDTYPE), sizeof(TRDTYPE));	        
-   BUG; // no bug at all, just do not use this LoopSum
+   BUG; // no bug at all, just do not use this SumUp
 
     
 #endif
 
     
 
-#define gV5_LoopEnd(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
+#define gV5_Sort(TYPE, SNDTYPE, TRDTYPE, AVXTYPE, ENDTYPE)		\
   for (Long rr=0; rr < NVblocks; rr++) {				\
     AVXTYPE *tmp = sum + rr * blocksP1Xrows;				\
     Long sEnd = MIN(VatOnce, repetV-rr * VatOnce);			\
@@ -351,32 +386,33 @@ if (false) PRINTF("end sum up\n");
   } // end function
  
 
+
+
 #if defined AVXLOAD
 #undef AVXLOAD
 #undef AVXSTORE
 #undef AVXADD
 #endif
-#define AVXLOAD LOADuDOUBLE
-#define AVXSTORE STOREuDOUBLE
-#define AVXADD ADDDOUBLE
-gV5_start(MY_VARIANT, double, LongDouble, double, Doubles, double)
+#define AVXLOAD LOADuFLOAT
+#define AVXSTORE STOREuFLOAT
+#define AVXADD ADDFLOAT
+gV5_start(MY_VARIANT, floatD, LongDouble, float, Floats, double)
 #ifdef DO_PARALLEL
-#pragma omp parallel for num_threads(cores) schedule(static)
+#pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopOne(double, LongDouble, double, Doubles, double)
+gV5_CreateHash(floatD, LongDouble, float, Floats, double)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static)
 #endif
-gV5_LoopTwo(double, LongDouble, double, Doubles, double)
+gV5_MainLoop(floatD, LongDouble, float, Floats, double)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopSum(double, LongDouble, double, Doubles, double)
+gV5_SumUp(floatD, LongDouble, float, Floats, double)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopEnd(double, LongDouble, double, Doubles, double)
-  
+gV5_Sort(floatD, LongDouble, float, Floats, double)
 
 
 
@@ -385,28 +421,27 @@ gV5_LoopEnd(double, LongDouble, double, Doubles, double)
 #undef AVXSTORE
 #undef AVXADD
 #endif
-#define AVXLOAD LOADuFLOAT
-#define AVXSTORE STOREuFLOAT
-#define AVXADD ADDFLOAT
-gV5_start(MY_VARIANT, floatD, LongDouble, float, Floats, double)
+#define AVXLOAD LOADuDOUBLE
+#define AVXSTORE STOREuDOUBLE
+#define AVXADD ADDDOUBLE
+gV5_start(MY_VARIANT, double, LongDouble, double, Doubles, double)
 #ifdef DO_PARALLEL
-#pragma omp parallel for num_threads(cores) schedule(static) 
+#pragma omp parallel for num_threads(cores) schedule(static)
 #endif
-gV5_LoopOne(floatD, LongDouble, float, Floats, double)
+gV5_CreateHash(double, LongDouble, double, Doubles, double)
 #ifdef DO_PARALLEL
-#pragma omp parallel for num_threads(cores) schedule(static)
+#pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopTwo(floatD, LongDouble, float, Floats, double)
+gV5_MainLoop(double, LongDouble, double, Doubles, double)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopSum(floatD, LongDouble, float, Floats, double)
+gV5_SumUp(double, LongDouble, double, Doubles, double)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopEnd(floatD, LongDouble, float, Floats, double)
-
-
+gV5_Sort(double, LongDouble, double, Doubles, double)
+  
 
 
 #endif
diff --git a/src/miraculix/5codesUint.cc b/src/miraculix/5codesUint.cc
index e53739cb..ba77f3a7 100755
--- a/src/miraculix/5codesUint.cc
+++ b/src/miraculix/5codesUint.cc
@@ -232,19 +232,19 @@ gV5_start(32, longD, LongDouble, LongDouble, LongDouble, double)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopOne(longD, LongDouble, LongDouble, LongDouble, double)
+gV5_CreateHash(longD, LongDouble, LongDouble, LongDouble, double)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopTwo(longD, LongDouble, LongDouble, LongDouble, double)
+gV5_MainLoop(longD, LongDouble, LongDouble, LongDouble, double)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopSum(longD, LongDouble, LongDouble, LongDouble, double)
+gV5_SumUp(longD, LongDouble, LongDouble, LongDouble, double)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopEnd(longD, LongDouble, LongDouble, LongDouble, double)
+gV5_Sort(longD, LongDouble, LongDouble, LongDouble, double)
 
 
 
@@ -252,19 +252,19 @@ gV5_start(32, LongDouble, LongDouble, LongDouble, LongDouble, LongDouble)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopOne(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble)
+gV5_CreateHash(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif 
-gV5_LoopTwo(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble)
+gV5_MainLoop(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopSum(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble)
+gV5_SumUp(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopEnd(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble)
+gV5_Sort(LongDouble, LongDouble, LongDouble, LongDouble, LongDouble)
 
 
    
@@ -272,19 +272,19 @@ gV5_start(32, Ulong, Ulong, Ulong, Ulong, Ulong)
 #ifdef DO_PARALLEL 
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif 
-gV5_LoopOne(Ulong, Ulong, Ulong, Ulong, Ulong) 
+gV5_CreateHash(Ulong, Ulong, Ulong, Ulong, Ulong) 
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif 
-gV5_LoopTwo(Ulong, Ulong, Ulong, Ulong, Ulong)
+gV5_MainLoop(Ulong, Ulong, Ulong, Ulong, Ulong)
 #ifdef DO_PARALLEL 
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif 
-gV5_LoopSum(Ulong, Ulong, Ulong, Ulong, Ulong)
+gV5_SumUp(Ulong, Ulong, Ulong, Ulong, Ulong)
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(cores) schedule(static) 
 #endif
-gV5_LoopEnd(Ulong, Ulong, Ulong, Ulong, Ulong)
+gV5_Sort(Ulong, Ulong, Ulong, Ulong, Ulong)
 
 
 
diff --git a/src/miraculix/Files32.cc b/src/miraculix/Files32.cc
index 9e930c62..995a1e42 100755
--- a/src/miraculix/Files32.cc
+++ b/src/miraculix/Files32.cc
@@ -125,11 +125,9 @@ SEXP file_binary(char *file, char *file_coding, int header, bool isSNPxInd,
     matrix_size = matrixLDA * colsLDA,					
     idxIncr = isSNPxInd ? matrixLDA : 1;				
   Long lda = info[LDA];						
-  if ((matrix = (unit_t*) CALLOC(matrix_size, BytesPerUnit)) == NULL)
-    ERR0("memory space could not be acquired");
-  char *buffer0 = (char*) CALLOC(blocks * bytesperblock +// OK
-				 MaxByteAlign, 1), 
-    *buffer = (char*)((1L + (uintptr_t) buffer0 / MaxByteAlign)*MaxByteAlign);
+  matrix = (unit_t*) CALLOC(matrix_size, BytesPerUnit);
+  char *buffer0 = (char*) CALLOC(blocks * bytesperblock + MaxByteAlign, 1);// OK
+  char *buffer =(char*)((1L + (uintptr_t) buffer0 / MaxByteAlign)*MaxByteAlign);
 
   if ((fp = fopen(file, "r")) == NULL)
     ERR1("file '%.50s' could not be opened", file);
diff --git a/src/miraculix/FilesUint.cc b/src/miraculix/FilesUint.cc
index a7d30f3a..f5d7f255 100755
--- a/src/miraculix/FilesUint.cc
+++ b/src/miraculix/FilesUint.cc
@@ -366,8 +366,7 @@ SEXP file_intern(SEXP file, coding_type coding, int variant,
 
   // printf("\n########### matrixLDA %d %d %d\n", matrixLDA, DeltaMatrix, isHaplo(coding));
   
-  if ((matrix = (unit_t*) CALLOC(matrix_size, BytesPerUnit)) == NULL)
-    ERR0("memory space could not be acquired");
+  matrix = (unit_t*) CALLOC(matrix_size, BytesPerUnit);
 
   // jump header lines
   fp = fopen(filename, "r");
diff --git a/src/miraculix/MXinfo.h b/src/miraculix/MXinfo.h
index 08811ac7..cd24297a 100755
--- a/src/miraculix/MXinfo.h
+++ b/src/miraculix/MXinfo.h
@@ -60,6 +60,7 @@ static inline  uintptr_t algn_generalL(int *X, Long LDAbitalign) {
   assert(ROUND_GEQ((uintptr_t) X, f) >= (uintptr_t) X);
   return ROUND_GEQ((uintptr_t) X, f);
 }
+
 static inline void *algn_generalS(SEXP Code, Long LDAbitalign) {
   assert(getAttribPointer(Code, Information) == R_NilValue);
   uintptr_t X = (uintptr_t) INTEGER(Code);
diff --git a/src/miraculix/MoBPS_R.cc b/src/miraculix/MoBPS_R.cc
index faa36823..f5456c77 100755
--- a/src/miraculix/MoBPS_R.cc
+++ b/src/miraculix/MoBPS_R.cc
@@ -650,8 +650,6 @@ SEXP compute(SEXP popul, SEXP Generation, SEXP Sex, SEXP Nr,
     memInUnits = (Long) individuals * ldaHPL,
     mem = calculateAlignedMem(memInUnits, TwoBitHaplo);
   int *G = (int *) CALLOC(mem, BytesPerUnit);
-  if (G == NULL) ERR1("mem allocation (%u bytes)",
-		      (Uint) (mem * BytesPerUnit));
 
   unit_t *g = (unit_t*) algn_generalL(G, MY_LDABITALIGN);
   if ((Uint) LENGTH(Vec) != individuals) ERR0("'vec' not of correct length");
diff --git a/src/miraculix/Template.h b/src/miraculix/Template.h
index 2a6354f2..989c1fe3 100755
--- a/src/miraculix/Template.h
+++ b/src/miraculix/Template.h
@@ -50,14 +50,17 @@ typedef void (*coding_t)(unit_t *, Long, Long, Long, Long, Long,
 
 #define vector_header(TYPE, ENDTYPE)					\
   /* !!! not snps / indivduals for FiveCodesTransposed !!! */		\
-  (unit_t *code, Long rows, Long cols, Long lda,			\
+  (unit_t VARIABLE_IS_NOT_USED  *code,					\
+   Long VARIABLE_IS_NOT_USED rows, Long VARIABLE_IS_NOT_USED cols,	\
+   Long VARIABLE_IS_NOT_USED lda,					\
    coding_type VARIABLE_IS_NOT_USED coding,				\
    int VARIABLE_IS_NOT_USED variant,					\
    LongDouble VARIABLE_IS_NOT_USED * pseudoFreq,			\
-   TYPE *V, Long repetV, Long ldV,					\
+   TYPE VARIABLE_IS_NOT_USED *V, Long VARIABLE_IS_NOT_USED repetV,	\
+   Long VARIABLE_IS_NOT_USED ldV,					\
    basic_options VARIABLE_IS_NOT_USED *opt,				\
    tuning_options VARIABLE_IS_NOT_USED *tuning,				\
-   ENDTYPE *Ans, Long ldAns)
+   ENDTYPE VARIABLE_IS_NOT_USED *Ans, Long VARIABLE_IS_NOT_USED ldAns)
 
 
 typedef void (*vector_double_t) vector_header(double, double);
@@ -121,11 +124,12 @@ typedef void (*gV_vG_Ulong_t) gV_vG_header0(Ulong);
 #define CodesPerLong ((int) (sizeof(Long) * BitsPerByte / BitsPerCode))
 #define sparseTGeno_start(NAME)						\
   sparseTGeno_header(NAME){						\
-  if (nrow_code > (Long)MAXINT || nrow_code < (Long)CodesPerLong ||	\
-      ncol_code > (Long)MAXINT) BUG;					\
+  if (nrow_code > (Long)MAXINT || ncol_code > (Long)MAXINT) BUG;	\
   const basic_options *opt = &(utils->basic);				\
   const int cores = GreaterZero(opt->cores);
  
+// || nrow_code < (Long)CodesPerLong
+
   
 typedef void (*sparseTGeno_t) sparseTGeno_Header;
 		 
diff --git a/src/miraculix/Vector.matrix.Uint.cc b/src/miraculix/Vector.matrix.Uint.cc
index d05d923a..c46d0332 100755
--- a/src/miraculix/Vector.matrix.Uint.cc
+++ b/src/miraculix/Vector.matrix.Uint.cc
@@ -281,7 +281,7 @@ gV_raw(LongDouble)
 void VectorRelMatrix( SEXP SxI, SEXP SxI2, double *V, Long repetV, int compare,
 		     option_type *global, utilsoption_type *utils,
 		     double *ans) {
-  const int debug =  false;
+  const int debug =  !false;
 
   //  if (false) {   printINFO(SxI); BUG; }
  
diff --git a/src/miraculix/compatibility.C.h b/src/miraculix/compatibility.C.h
index f1bbc4bd..5471c959 100755
--- a/src/miraculix/compatibility.C.h
+++ b/src/miraculix/compatibility.C.h
@@ -67,7 +67,7 @@ extern int errno ;
 #define EXP exp
 #define FABS(X) fabs((double) X) // OK; keine Klammern um X!
 #if ! defined MALLOCX
-#define MALLOCX malloc
+#define MALLOCX(X) notNull(malloc(X), __LINE__, __FILE__)
 #define FLOOR floor
 #define SQRT(X) sqrt((double) X) // OK
 #define CEIL(X) ceil((double) X) // OK; keine Klammern um X!
@@ -85,8 +85,8 @@ extern int errno ;
 #define MEMMOVE memmove
 #define MEMSET memset  
 #define MEMCMP memcmp
-#define AALLOC aligned_alloc
-#define CALLOCX calloc
+#define AALLOC(X) notNull(aligned_alloc(X), __LINE__, __FILE__)
+#define CALLOCX(X,N) notNull(calloc(X,N), __LINE__, __FILE__)
 #define SPRINTF sprintf // Rprintf
 #define QSORT qsort
 #define RFERROR(X) {fprintf(stderr, "%s", X); exit(EXIT_FAILURE); }
diff --git a/src/miraculix/compatibility.R.h b/src/miraculix/compatibility.R.h
index 90c11ddd..239e0e4b 100755
--- a/src/miraculix/compatibility.R.h
+++ b/src/miraculix/compatibility.R.h
@@ -44,7 +44,7 @@
 #define EXP std::exp
 #define FABS(X) std::fabs((double) X) // OK; keine Klammern um X!
 #if ! defined MALLOCX
-#define MALLOCX std::malloc
+#define MALLOCX(X) notNull(std::malloc, __LINE__, __FILE__)
 #define FLOOR std::floor
 #define SQRT(X) std::sqrt((double) X) // OK
 #define CEIL(X) std::ceil((double) X) // OK; keine Klammern um X!
@@ -62,8 +62,8 @@
 #define MEMMOVE std::memmove
 #define MEMSET std::memset  
 #define MEMCMP std::memcmp
-#define AALLOC std::aligned_alloc
-#define CALLOCX std::calloc
+#define AALLOC notNull(std::aligned_alloc, __LINE__, __FILE__)
+#define CALLOCX notNull(std::calloc, __LINE__, __FILE__)
 #define SPRINTF std::sprintf // Rprint 
 #define QSORT std::qsort
 #define RFERROR error
diff --git a/src/miraculix/compatibility.c.cc b/src/miraculix/compatibility.c.cc
index c45b1b08..7bf89086 100755
--- a/src/miraculix/compatibility.c.cc
+++ b/src/miraculix/compatibility.c.cc
@@ -78,3 +78,10 @@ void stopIfNotAnyIntI(Long i, Long line, const char *file) {
     ERR3("value (%ld) not an integer at line %ld in %s\n", i, line, file);
 }
   
+
+
+void *notNull(void *X, int line, const char *file) {
+  if (X!=NULL) return X;
+  ERR2("No memory was allocated at line %d of %s.\n", line, file);
+  return NULL;
+}
diff --git a/src/miraculix/compatibility.general.h b/src/miraculix/compatibility.general.h
index e0712398..59cf7d35 100755
--- a/src/miraculix/compatibility.general.h
+++ b/src/miraculix/compatibility.general.h
@@ -80,5 +80,6 @@ typedef long double LongDouble;
 #define ASSERT_SOLVE(sp) if (sp == NULL) sp = &(OPTIONS.solve); else {}
 #endif
 
-
+void *notNull(void *X, int line, const char *file);
+  
 #endif
diff --git a/src/miraculix/compatibility.sexp.cc b/src/miraculix/compatibility.sexp.cc
index 845135db..366ce688 100755
--- a/src/miraculix/compatibility.sexp.cc
+++ b/src/miraculix/compatibility.sexp.cc
@@ -135,7 +135,7 @@ SEXP allocObject(int type, Long len, void *x) {
     Ans->x = CALLOC(1, sizeof(void*));
     *((void**) (Ans->x)) = x;
   } else Ans->x = len == 0 ? R_NilValue : CALLOC(Memory(type), len);
-  // printf("don %ld\n", (Long) (Ans->x));
+  
   return Ans;
 }
 
diff --git a/src/miraculix/haplogeno.cc b/src/miraculix/haplogeno.cc
index 07c1c44d..b92e2124 100755
--- a/src/miraculix/haplogeno.cc
+++ b/src/miraculix/haplogeno.cc
@@ -1068,8 +1068,8 @@ void do_centering(Long Snps, Long Individuals, centering_type centered,
   }
   if (transposed) centered = centered == RowMeans ? ColMeans
 		    : centered == ColMeans ? RowMeans : centered;
-  double *sums = (double *) MALLOC(sizeof(double) * cols),
-    factor = 1.0; 
+  double factor = 1.0,
+    *sums = (double *) MALLOC(sizeof(double) * cols);   
 
   switch(centered) {
   case RowMeans:
@@ -1216,7 +1216,6 @@ double *crossprod(unit_t * Code, Long snps, Long individuals,  Long lda,
 		  basic_options *opt, tuning_options *tuning) {
   // never squared here!!!
   double *A = (double *) MALLOC(individuals * individuals * sizeof(double));
-  if (A == NULL) ERR0("mem allocation");
 
   crossprodI(Code, snps, individuals, lda, coding, variant,
 	     centered, normalized, squared, Precision,
@@ -1373,7 +1372,6 @@ void allele_sum_I(SEXP SxI,
     FREE(E);
   }
 
-  assert(sum != NULL);
   if (EightByte) {
     for (Long i=0; i<rows; i++) ans[i] = sum[2 * i] + sum[2*i + 1];
     FREE(sum);
@@ -1703,7 +1701,7 @@ void sparseTGeno(Uchar *code,
   if (tSparse) BUG;
   sparseTGeno_t sTG = NULL;
   switch(coding) {
-  case Plink : case OrigPlink : sTG = sparseTGenoOrigPlink; break;    
+  case Plink : case OrigPlink : sTG = sparseTGenoPlink; break;    
   case TwoBitGeno : sTG = sparseTGeno2Bit; break;
   case OneByteGeno : sTG = sparseTGenoOneByte; break;
   default : BUG;
diff --git a/src/miraculix/intrinsics.h b/src/miraculix/intrinsics.h
index 68517b62..6ad2ecdc 100755
--- a/src/miraculix/intrinsics.h
+++ b/src/miraculix/intrinsics.h
@@ -152,6 +152,7 @@ union uni512 {
 #define BlockType __m512i ALIGNED
 #define UnionType0 uni512
 #define Doubles __m512d
+#define Floats __m512
 
 #define LOADuDOUBLE _mm512_loadu_pd
 #define LOADuFLOAT _mm512_loadu_ps
@@ -218,7 +219,7 @@ union uni512 {
   #define ADD8 _mm512_add_epi8
   #define SAD8  _mm512_sad_epu8
   #define SHUFFLE8 _mm512_shuffle_epi8
-#elif
+#else
   #define LOWER256(A) (__m256i) _mm512_extractf64x4_pd((__m512d) (A), 0)
   #define UPPER256(A) (__m256i) _mm512_extractf64x4_pd((__m512d) (A), 1)
   #define DO_256(X, A, B) \
@@ -227,7 +228,7 @@ union uni512 {
 
   #define ADD8(A, B) DO_256(_mm256_add_epi8, A, B)		
   #define SAD8(A, B) DO_256(_mm256_sad_epu8, A, B)
-  #define SHUFFLE8 DO_256(_mm256_shuffle_epi8, A, B)
+  #define SHUFFLE8(A, B) DO_256(_mm256_shuffle_epi8, A, B) 
 #endif
 
 
diff --git a/src/miraculix/intrinsicsIntern.h b/src/miraculix/intrinsicsIntern.h
index a686da1f..e5735f3b 100755
--- a/src/miraculix/intrinsicsIntern.h
+++ b/src/miraculix/intrinsicsIntern.h
@@ -81,8 +81,8 @@ POPCNT_EMUL64(BlockType0 v) { // sizeof(Long)=64
 #if defined AVX512
 
 static inline Ulong HORIZ_ADD64(BlockType0 A) {
-  uni256 z = _mm256_add_epi64(_mm512_extractf64x4_epi64(A, 1),
-			      _mm512_extractf64x4_epi64(A, 0));
+ __m256i z = _mm256_add_epi64(_mm512_extracti64x4_epi64(A, 1),
+			      _mm512_extracti64x4_epi64(A, 0));
   uni128 x;
   x.vi = _mm_add_epi64(_mm256_extracti128_si256(z, 1),
 		       _mm256_extracti128_si256(z, 0));
diff --git a/src/miraculix/kleinkram.cc b/src/miraculix/kleinkram.cc
index b794ad95..1a490938 100755
--- a/src/miraculix/kleinkram.cc
+++ b/src/miraculix/kleinkram.cc
@@ -383,7 +383,6 @@ void XCXt(double *X, double *C, double *V, Long nrow, Long dim /* dim of C */, i
   double  
     *endpX = X + nrow,
     *tmp = (double*) MALLOC(sizeof(double) * size); // tmp = XC
-  if (tmp == NULL) RFERROR("XCXt: memory allocation error in XCXt");
  
 #ifdef DO_PARALLEL
 #pragma omp parallel for num_threads(GreaterZero(cores))  schedule(static)
diff --git a/src/miraculix/main.cc b/src/miraculix/main.cc
index c04d3668..bdd7418e 100755
--- a/src/miraculix/main.cc
+++ b/src/miraculix/main.cc
@@ -38,16 +38,30 @@
 void get_started(option_type **g, utilsoption_type **u);
 
 
-// make standalone && Wageningen/run_gcc snps=2400 ind=3800 cores=4 coding=4 mode=3 SNPmode=0 center=0 repetV=16 trials=1 variant=257 cmp=10
+
+// SPARSE
+// make standalone && make standalone && Wageningen/run_gcc snps=11200 ind=12300 cores=10 coding=4 mode=3 SNPmode=3 repetV=32 trials=10 centered=0 variant=256 sparse=100000 cmp=1 rowMode=2 colMode=2 fillMode=2
+// L 361, plinkUint pr a gma for
+//   without static:  18/100
+//   with static: 18/100
+//   with static ordered: 19/110
+
+// make standalone && make standalone && Wageningen/run_gcc snps=112000 ind=123000 cores=20 coding=4 mode=3 SNPmode=3 repetV=32 trials=10 centered=0 variant=256 sparse=1 cmp=1  ## katastrophal schlecht bei cores=20 -- mit cores=1 viel besser!
+
+// make standalone && make standalone && Wageningen/run_gcc snps=112000 ind=123000 cores=20 coding=4 mode=3 SNPmode=3 repetV=32 trials=10 centered=0 variant=256 sparse=1 cmp=1 ## katastrophal schlecht bei cores=20
 
 
 
-//make standalone && Wageningen/run_gcc snps=128000 ind=136960 cores=10 coding=5   mode=3 SNPmode=3 repetV=32 trials=1 centered=0 variant=257
+// make standalone && Wageningen/run_gcc snps=5024 ind=4100 cores=5 coding=5 mode=3 SNPmode=3 repetV=2 trials=1 cmp=9
+
+//make standalone && Wageningen/run_gcc snps=128000 ind=136960 cores=10 coding=5   mode=3 SNPmode=3 repetV=32 trials=1 centered=0 variant=256
 
 
 // make standalone && Wageningen/run_gcc snps=12800 ind=13800 cores=4 coding=4 mode=3 SNPmode=3 center=0 repetV=16 trials=1 variant=257
 
 
+// make standalone && Wageningen/run_gcc snps=2400 ind=3800 cores=4 coding=4 mode=3 SNPmode=0 center=0 repetV=16 trials=1 variant=257 cmp=10
+
 
 
 // touch miraculix/src/5codesUint.cc && make standalone && Wageningen/run_gcc snps=150000 ind=150000 cores=6 coding=5 mode=3 SNPmode=3 repetV=100 trials=3 centered=0 variant=32
@@ -67,7 +81,7 @@ void get_started(option_type **g, utilsoption_type **u);
 
 
 
-// make standalone && Wageningen/run_gcc snps=50241 ind=410000 cores=5 coding=5 mode=3 SNPmode=3 repetV=1 trials=5
+// make standalone && Wageningen/run_gcc snps=50241 ind=410000 cores=5 coding=5 mode=3 SNPmode=3 repetV=1 trials=5 
 
 
 // make standalone && Wageningen/run_gcc snps=50241 ind=410000 cores=6 coding=5 mode=3 SNPmode=3 repetV=1 trials=1 variant=128
@@ -79,7 +93,7 @@ void get_started(option_type **g, utilsoption_type **u);
 
 
 
-//  make standalone && Wageningen/run snps=102400 ind=51200 cores=2 SNPmode=5 mode=1 coding=2 cmp=0
+//  make standalone && Wageningen/run_gcc snps=102400 ind=51200 cores=2 SNPmode=5 mode=1 coding=2 cmp=0
 
 
 //  make standalone && Wageningen/run snps=262144 ind=512 variant=32 mode=1
@@ -328,7 +342,7 @@ void RandomSparse(int nIdx, int max_ncol,
       colIdxB[i]= i % 2 == 0 ? max_ncol - 1 : i % 3 == 0 ? 0 : i % max_ncol;
   }
   
-  double *B = *valueB = (double*) MALLOC(sizeof(**valueB) * total);  
+  double *B = *valueB = (double*) MALLOC(sizeof(**valueB) * total);
   switch (fillMode) {
   case 0 : for (int i=0; i<total; i++) B[i] = (double) (i + 1);
     break;
@@ -445,7 +459,7 @@ int main(int argc, char *argv[]) {
   int colMode = (int) argdefault[z++];
    int fillMode = (int) argdefault[z++];
  
-  opt->efficient = false;
+  opt->efficient = variant >= 512;
   CLOCK("starting");
   
   Long size = indiv * repetV;
@@ -559,65 +573,65 @@ int main(int argc, char *argv[]) {
       CLOCK("transforming to OrigPlink");
       UNPROTECT(1);
     }
-    assert(code != NULL);
 
     int p_nrow = sparse_nrow < 10 ? sparse_nrow : 10;
     int p_ncol = snps < 20 ? (int) snps : 20;
     int p_indiv = indiv < 10 ? (int) indiv : 10;
 
-    for (int k=0; k<30; k++)
+#define repetSparse 10
+    for (int k=0; k<repetSparse; k++)
       sparseTGeno(code, snps, indiv,
 		  true_coding == OrigPlink ? DIV_GEQ(snps,4) : infoCoded[LDA],
 		  true_coding == OrigPlink ? OrigPlink
 		                   : (coding_type) infoCoded[CODING],
 		  valueB, sparse_nrow, rowIdxB, colIdxB, false, global, utils,
 		  Ans, sparse_nrow);
-    CLOCK("Sparse multiplication");
+    CLOCK1("%d Sparse multiplications", repetSparse);
     //    BUG;
 
    
-   if (cmp && SxI != R_NilValue) {
+    if (cmp && SxI != R_NilValue) {
       Long lda = info[LDA];
       codeSxI = (Uchar*) Align(SxI, opt);
       AnsSxI = (double*) MALLOC(AnsBytes);
       sparseTGeno(codeSxI, snps, indiv, lda, (coding_type) info[CODING],
 		  valueB, sparse_nrow, rowIdxB, colIdxB, false, global, utils,
 		  AnsSxI, sparse_nrow);
-    CLOCK("Sparse multiplication using OneByte");
-   
-	if (false) {
-      for (Long j=0; j<p_ncol; j++) {
-	for (Long i=0; i<p_indiv; i++) {
-	  PRINTF("%u\t", (Uint) codeSxI[j + i * lda * sizeof(unit_t)]);
+      CLOCK("Sparse multiplication using OneByte");
+      
+      if (false) {
+	for (Long j=0; j<p_ncol; j++) {
+	  for (Long i=0; i<p_indiv; i++) {
+	    PRINTF("%u\t", (Uint) codeSxI[j + i * lda * sizeof(unit_t)]);
+	    }
+	  PRINTF("\n");
 	}
 	PRINTF("\n");
       }
-	PRINTF("\n");
-   }
-
-	if (false) {
-	  for (Long j=0; j<p_nrow; j++) {
+      
+      if (false) {
+	for (Long j=0; j<p_nrow; j++) {
 	PRINTF("%ld ", j);	
 	for (Long i=rowIdxB[j]; i<rowIdxB[j+1]; i++) {
 	  PRINTF("%d:%3.1f ", colIdxB[i], valueB[i]);
 	}
 	PRINTF("\n");
-      }
-       printf("\n");
-	}
-
-       if (false)
-      for (int k=0; k<p_nrow; k++) {
-	for (int i=0; i<p_ncol; i++) {
-	  printf("%6.1f", AnsSxI[i * sparse_nrow + k]);
-	  if (Ans[i * sparse_nrow + k] ==  AnsSxI[i * sparse_nrow + k])
-	    printf("*   ");
-	  else printf(":%3.1f ", Ans[i * sparse_nrow + k]);
-	  PRINTF("\t");
 	}
 	printf("\n");
       }
       
+      if (false)
+	for (int k=0; k<p_nrow; k++) {
+	  for (int i=0; i<p_ncol; i++) {
+	    printf("%6.1f", AnsSxI[i * sparse_nrow + k]);
+	    if (Ans[i * sparse_nrow + k] ==  AnsSxI[i * sparse_nrow + k])
+	      printf("*   ");
+	    else printf(":%3.1f ", Ans[i * sparse_nrow + k]);
+	    PRINTF("\t");
+	  }
+	  printf("\n");
+	}
+      
 
       for (int k=0; k<sparse_nrow; k++) {
 	bool first = true;
@@ -630,10 +644,10 @@ int main(int argc, char *argv[]) {
 	  }
 	}
       }    	
-   }
-
+    }
+    
 
-   FREE(valueB);
+    FREE(valueB);
     FREE(rowIdxB);
     FREE(colIdxB);
     FREE(Ans);
@@ -685,6 +699,9 @@ int main(int argc, char *argv[]) {
     CLOCK("comparing...");
   }
 
+
+
+  
   CLOCK1("BEFORE STARTING TRIALS = %d\n", trials); 
   infoCoded[VARIANT] = variant;
   //
diff --git a/src/miraculix/makefile.c.mk b/src/miraculix/makefile.c.mk
index f94487cd..2d082e17 100755
--- a/src/miraculix/makefile.c.mk
+++ b/src/miraculix/makefile.c.mk
@@ -17,8 +17,8 @@ SSE41_FLAG=-msse4.1
 AVX_FLAG=-mavx
 #AVX2_FLAG=-mavx2 
 AVX2_FLAG=-mavx2
-AVX512_FLAG=-mavx512 
-MAX_AVX512_FLAG=-mavx512
+AVX512_FLAG=-mavx512f
+MAX_AVX512_FLAG=-mavx512f
 LINKER=-std=gnu++14 -L/usr/local/lib -L/usr/lib64
 
 else ifeq ($(COMPILER), intel)
@@ -124,6 +124,9 @@ intrinsics.o: $(SRC)/intrinsics.cc
 %avx.o: $(SRC)/%avx.cc
 	$(CC) -c $< -o $@ $(CCFLAGS) $(AVX_FLAG)
 
+#%512.o: $(SRC)/%512.cc
+#	$(CC) -c $< -o $@ $(CCFLAGS) $(AVX512_FLAG) 
+
 %256.o: $(SRC)/%256.cc
 	$(CC) -c $< -o $@ $(CCFLAGS) $(AVX2_FLAG)
 
diff --git a/src/miraculix/makefile.miraculix.mk~ b/src/miraculix/makefile.miraculix.mk~
deleted file mode 100755
index edab6749..00000000
--- a/src/miraculix/makefile.miraculix.mk~
+++ /dev/null
@@ -1,22 +0,0 @@
-##################################################
-## miraculix
-##################################################
-
-MY_SSE2=
-MY_SSE3=
-MY_SSSE3=1bit128.o 2bit128.o
-MY_SSE41=5codes128_41.o
-MY_AVX=
-MY_AVX2=5codes256.o  Scalar256.o 1bit256.o  2bit256.o
-MY_AVX512F=1bit512.o 2bit512.o 
-MY_CU_FILES=mma_75.o mma_61.o solve_61.o
-MY_MAX_AVX512F=intrinsics.o
-
-MY_CC_FILES=plinkUint.o 5codesChar.o 5codesUint.o kleinkram.o FilesUint.o Files32.o transformUint.o Haplo2_Uint.o HaploUint.o OneByteUint.o OneByte32.o OneByte64.o extern.o  haplogeno.o  mmagpuDummy.o 4ByteUint.o Automiraculix.o options.o  1bitUint.o 1bit32.o  1bit64.o 2bitUint.o 2bit32.o 2bit64.o  3bitUint.o 3bit64.o Scalar.o MX.o  xport_import.o compatibility.c.o compatibility.sexp.o  Vector.matrix.Uint.o Vector.matrix.LD.o Vector.matrix.D.o 
-
-MY_CC_FOR_R_FILES=transformR.o kleinkram.R.o HaploR.o haplogeno.R.o Vector.matrix.R.o options.R.o ScalarR.o utils_miraculix.R.o windower.R.o MoBPS_R.o xport_import.R.o scan.R.o
-
-MY_CC_DISTRIBUTED=
-MY_C_FILES=5codesAPI.o zzzR.o 
-
-MY_FORTRAN_FILES=
\ No newline at end of file
diff --git a/src/miraculix/makefile.rfu.mk b/src/miraculix/makefile.rfu.mk
index a2d56c22..09bf84c9 100755
--- a/src/miraculix/makefile.rfu.mk
+++ b/src/miraculix/makefile.rfu.mk
@@ -13,7 +13,7 @@ MY_AVX2=rfu_avx2.o
 MY_AVX512F=
 MY_CU_FILES=solve_RFU_61.o gpu_info_61.o
 
-MY_CC_FILES=AutoRandomFieldsUtils.o  extern_rfu.o maths.o options_rfu.o solve_rfu.o sort.o sortLong.o utils.o win_linux_aux.o xport_import_rfu.o    
+MY_CC_FILES=AutoRandomFieldsUtils.o  extern_rfu.o maths.o options_rrfu.o solve_rrfu.o sort.o sortLong.o utils.o win_linux_aux.o xport_import_rrfu.o    
 
 MY_CC_FOR_R_FILES=brdomain_R.o kleinkram.R.o maths.R.o options_RFU.o RFoptions_RFU.o solve_RFU.o sort.R.o utils.R.o xport_import_RFU.o gpu_info_R.o 
 
diff --git a/src/miraculix/makefile.rfu.mk~ b/src/miraculix/makefile.rfu.mk~
deleted file mode 100755
index 11629a71..00000000
--- a/src/miraculix/makefile.rfu.mk~
+++ /dev/null
@@ -1,24 +0,0 @@
-##################################################
-## RandomFieldsUtils
-##################################################
-
-##CROSS=
-
-MY_SSE2=
-MY_SSE3=
-MY_SSSE3=
-MY_SSE41=
-MY_AVX=rfu_avx.o
-MY_AVX2=rfu_avx2.o
-MY_AVX512F=
-MY_CU_FILES=solve_RFU_61.o gpu_info_61.o
-
-MY_CC_FILES=AutoRandomFieldsUtils.o  extern_rfu.o maths.o options_rfu.o solve_rfu.o sort.o sortLong.o utils.o win_linux_aux.o xport_import_rfu.o    
-
-MY_CC_FOR_R_FILES=brdomain_R.o kleinkram.R.o maths.R.o options_RFU.o RFoptions_RFU.o solve_RFU.o sort.R.o utils.R.o xport_import_RFU.o gpu_info_R.o 
-
-MY_CC_DISTRIBUTED=kleinkram.o compatibility.c.o compatibility.sexp.o main.o
-MY_C_FILES=zzz_RFU.o  ## unused
-
-#MY_FORTRAN_FILES=bckslvmodified.f cholmodified.f spamown.f 
-
diff --git a/src/miraculix/options_rrfu.cc b/src/miraculix/options_rrfu.cc
new file mode 100755
index 00000000..2cf3ddf7
--- /dev/null
+++ b/src/miraculix/options_rrfu.cc
@@ -0,0 +1,284 @@
+/*
+ Authors 
+ Martin Schlather, martin.schlather@uni-mannheim.de
+
+ Copyright (C) 2022-2023 Martin Schlather 
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+
+#include <unistd.h>
+#include "parallel_simd.h"
+#ifdef TIME_AVAILABLE
+#include <time.h>
+#endif
+
+
+
+
+#include "Basic_RFUlocal.h" // must be before anything else
+#include "kleinkram.h"
+#include "zzz_RFU.h"
+#include "options_RFU.h"
+#include "extern_RFU.h"
+#if defined compatibility_to_R_h     
+#include "xport_import_RFU.h"
+#endif
+
+// IMPORTANT: all names of general must have at least 3 letters !!!
+const char *basic[basicN] =
+  { "printlevel","cPrintlevel", "seed", "cores",
+    "skipchecks", "asList", "verbose", "helpinfo", "efficient",
+    "bigendian","warn_parallel", "exactness"
+  };
+
+const char *installNrun[installNrunN] =
+  { "kahanCorrection", "warn_unknown_option", "la_mode", 
+    "install","installPackages", "determineLAmode", "mem_is_aligned",
+    "gpuDevices", "maxStreams"
+  };
+
+const char * solve[solveN] = 
+  { "use_spam", "spam_tol", "spam_min_p", "svdtol", "eigen2zero",
+    "solve_method", "spam_min_n", "spam_sample_n", "spam_factor", "spam_pivot",
+    "max_chol", "max_svd", "pivot",
+    "pivot_idx", // dynamic parameter
+    "pivot_relerror", "pivot_max_deviation", "pivot_max_reldeviation",
+    "det_as_log", "pivot_actual_size", "pivot_check", "pseudoinverse",
+    "AtAmode", "AtAnrow", "AtAncol", "Strassen_min", "Strassen_factor"
+    //, "tmp_delete"
+  };
+
+
+const char * prefixlist[prefixN] = {"basic", "installNrun", "solve"};
+const char **allOptions[prefixN] = {basic, installNrun, solve};
+int allOptionsN[prefixN] = {basicN, installNrunN, solveN};
+
+
+utilsoption_type OPTIONS = { // OK
+  basic_START,
+  installNrun_START,
+  { solve_START },
+  dummy_START
+};
+
+
+
+
+int doPosDefIntern(double *M0, int size, bool posdef,
+		   double *rhs0, Long rhs_cols, double *result,
+		   double *logdet, int calculate, solve_storage *Pt,
+		   solve_options *sp, int VARIABLE_IS_NOT_USED cores);
+
+void params_utilsoption(int local, int *params) {
+  utilsoption_type *from = WhichOptionList(local);
+  params[PIVOT_IDX_N] = from->solve.n_pivot_idx;
+}
+
+void get_utils_basic(basic_options *opt, int local) {
+  utilsoption_type *from = WhichOptionList(local);
+  MEMCOPY(opt, from, sizeof(basic_options));
+}
+
+void get_utilsoption(utilsoption_type *S, int local) {
+  assert(solveN == 21 && basicN == 9 && installNrunN == 10 && prefixN==3);
+  utilsoption_type *from = WhichOptionList(local);
+  assert(from->solve.n_pivot_idx!=0 xor from->solve.pivot_idx == NULL);
+  assert(S->solve.n_pivot_idx!=0 xor S->solve.pivot_idx == NULL);
+  if (S->solve.n_pivot_idx != from->solve.n_pivot_idx) BUG;
+  int *save_idx = S->solve.pivot_idx,
+    n = S->solve.n_pivot_idx < from->solve.n_pivot_idx ?
+    S->solve.n_pivot_idx : from->solve.n_pivot_idx;
+  MEMCOPY(S, from, sizeof(utilsoption_type)); // OK
+  S->solve.pivot_idx = save_idx;
+  if (n > 0) {
+    MEMCOPY(S->solve.pivot_idx, from->solve.pivot_idx,
+	    sizeof(*(S->solve.pivot_idx)) * n);
+  }
+}
+
+void push_utilsoption(utilsoption_type *S, int local) {
+  utilsoption_type *to = WhichOptionList(local);
+  assert(to->solve.n_pivot_idx!=0 xor to->solve.pivot_idx == NULL);
+  assert(S->solve.n_pivot_idx!=0 xor S->solve.pivot_idx == NULL);
+  int *save_idx = to->solve.pivot_idx;
+  if (to->solve.n_pivot_idx != S->solve.n_pivot_idx) {
+    FREE(to->solve.pivot_idx);
+    to->solve.pivot_idx = (int*) MALLOC(S->solve.n_pivot_idx * sizeof(int));
+    save_idx = to->solve.pivot_idx;
+  }
+  MEMCOPY(to, S, sizeof(utilsoption_type)); // OK
+  to->solve.pivot_idx = save_idx;
+  if (S->solve.n_pivot_idx > 0) {
+    MEMCOPY(to->solve.pivot_idx, S->solve.pivot_idx,
+	    sizeof(*(to->solve.pivot_idx)) * S->solve.n_pivot_idx);
+  }
+}
+
+void del_utilsoption(utilsoption_type *S) {
+  FREE(S->solve.pivot_idx);
+  S->solve.n_pivot_idx = 0;
+}
+
+void update_utilsoption() {
+  push_utilsoption(&OPTIONS, true);
+}
+
+#define FASTER 1.3 // 1.3 as for typical application of likelihood,
+// the determinant calculation in RandomFieldsUtils is for free. Somehow a
+// balance
+int own_chol_up_to(int size, int maxtime, int VARIABLE_IS_NOT_USED cores) {
+#ifdef TIME_AVAILABLE
+  if (size <= 0) return true;
+  Long delta[2];
+  solve_options sp;
+  solve_storage pt;
+  solve_NULL(&pt);
+  MEMCOPY(&sp, &(OPTIONS.solve), sizeof(solve_options)); 
+  sp.Methods[0] = Cholesky;					     
+  sp.Methods[1] = NoFurtherInversionMethod;
+  sp.pivot_mode = PIVOT_NONE;	  
+  sp.sparse = False;	  
+  double old_quotient = RF_NAN;
+  // basic assumption is that R implementation's getting faster
+  // for larger matrices
+  //  printf("**** start\n");
+  while (true) {
+    //    printf("x \n");
+    int sizeP1 = size + 1,
+      sizesq = size * size,
+      loops = size > 64 ? 1 : 16384 / ((size + 8) * (size+8)) / 4;
+    double *M = (double*) MALLOC(sizesq * sizeof(double));
+    for (int j=0; j<=1; j++) {
+      //      printf("%d,%d\n", j, loops);
+      SetLaMode(j == 0 ? LA_INTERN : LA_R, cores);
+      clock_t start = clock();
+      for (int k=0; k<loops; k++) {      
+	MEMSET(M, 0, sizesq * sizeof(*M));
+	for (int i=0; i<sizesq; i+=sizeP1) M[i] = 1.0;
+	if (size > 1) M[1] = M[size] = 1e-5;
+	//printf("size=%d\n", size);
+	doPosDefIntern(M, size, true, NULL, 0, NULL, NULL, MATRIXSQRT, &pt, &sp,
+		       cores);
+	//printf("doen\n");
+      }
+      delta[j] = (Long) clock() - start; 
+      if (delta[j] < 0) delta[j] += MAXINT; // manual: 32bit repres. of clock
+    }
+    FREE(M);
+    if (PL > 2)
+      PRINTF("Cholesky decomposition for a %d x %d matrix needs %ld and %ld [mu s] on R and facile code on %d cores (#%d), respectively.\n", size, size, delta[1], delta[0], cores, loops);
+
+    //  printf("delta %d %d %d\n", delta[0], delta[1], maxtime);
+    
+    if (delta[0] > maxtime || delta[1] > maxtime ||
+	(double) delta[0] >= FASTER * (double) delta[1]){
+      solve_DELETE0(&pt);
+      if ((maxtime > 0 &&
+	   (delta[0] > 10 * maxtime || delta[1] > 10 * maxtime)) ||
+	  delta[0] > 2 * delta[1] || delta[1] > 2 * delta[0]) {
+	// seems to be time consuming. So stop.
+	return (double) delta[0] < FASTER * (double) delta[1]
+	  ? MAXINT : (size <= 0 ? 0 : size / 2);
+      }
+      break;
+    }
+    old_quotient = (double) delta[0] / (double) delta[1];
+    size *= 2;
+  }
+  double new_quotient = (double) delta[0] / (double) delta[1];
+  if (new_quotient < FASTER) return MAXINT;
+  if (size <= 0) return(0);
+  if (!R_FINITE(old_quotient)) {
+    // printf("halfsize\n");
+    int compare = own_chol_up_to(size / 2, 0, cores);
+   return compare == MAXINT ? size : compare;
+  }
+  double x0 = 0.5 * size * (1.0 + (FASTER - old_quotient) /
+			    (new_quotient - old_quotient)); //lin interpol
+  assert(x0 >= 0.5 * size && x0 <= size);
+  int compare = own_chol_up_to((int) x0, 0, cores);
+  //  printf("%f %f %f %f %d %d\n", x0,FASTER,  old_quotient, new_quotient, size, compare);
+  return (int) (compare == MAXINT ?  x0 : 0.5 * size);
+#else 
+  ERR0("option 'LA_AUTO' is available only on linux systems");
+  return 0;
+#endif
+}
+int own_chol_up_to(int VARIABLE_IS_NOT_USED cores) {
+  own_chol_up_to(256, 0, cores); //warm up for some BLAS implementatioan
+  //  own_chol_up_to(256, 50000);
+  //  own_chol_up_to(8, 50000);
+  return own_chol_up_to(256, 50000, cores);
+}
+
+
+  
+void SetLaMode(la_modes usr_mode, int VARIABLE_IS_NOT_USED cores) {
+  utilsoption_type *utils = &OPTIONS;
+  la_modes la_mode = usr_mode;
+  utils->solve.tinysize =
+    utils->installNrun.LaMaxTakeIntern = -1;
+#define TINY_SIZE_MAX 3  
+  if (la_mode == LA_INTERN) {
+    utils->solve.tinysize = TINY_SIZE_MAX;
+    utils->installNrun.LaMaxTakeIntern = MAXINT;
+  } else if (la_mode == LA_AUTO) {  
+    la_mode = HAS_GPU ? LA_GPU : LA_R ;
+#if defined TIME_AVAILABLE
+#  ifdef SCHLATHERS_MACHINE
+#else    
+    int PLalt = PL;
+    PL = 0;
+#  endif  
+    utils->installNrun.LaMaxTakeIntern = own_chol_up_to(cores);
+    utils->solve.tinysize = MIN(TINY_SIZE_MAX, utils->installNrun.LaMaxTakeIntern);
+    if (PL > 0)
+      PRINTF("Limit size for facile Cholesky algorithm  = %d\n",
+	     utils->installNrun.LaMaxTakeIntern);
+#  ifdef SCHLATHERS_MACHINE
+#else   
+    PL = PLalt;
+#  endif
+#endif
+  }
+  
+  if ((la_mode == LA_GPU || la_mode == LA_R) &&
+      utils->solve.pivot_mode > PIVOT_AUTO)
+    ERR0("Pivotized Cholesky decomposition has not been implemented yet for GPU and the LAPACK library");
+  
+  utils->installNrun.la_mode = la_mode;
+}
+void SetLaMode() {
+  utilsoption_type *utils = &OPTIONS;
+  SetLaMode(utils->installNrun.la_usr,
+	    GreaterZero(utils->basic.cores));
+}
+
+
+
+
+
+utilsoption_type *WhichOptionList(bool local) {  
+  if (local) {
+ #if defined compatibility_to_R_h     
+    KEY_type *KT = KEYT();
+    if (KT == NULL) BUG;
+    return &(KT->global_utils);
+#else
+    ERR0("'local' must be 'false' in standalone packages.")
+#endif
+  }
+  return &OPTIONS;
+}
diff --git a/src/miraculix/parallel_simd.h b/src/miraculix/parallel_simd.h
index 771cae6e..70c05090 100755
--- a/src/miraculix/parallel_simd.h
+++ b/src/miraculix/parallel_simd.h
@@ -18,12 +18,11 @@
 */
 
 
-
 #ifndef parallel_omp__H
 #define parallel_omp__H 1
 
 // #define NEVER_OMP 1
-#define NEVER_AVX512 1
+#define DO_AVX512F 1
 // #define NEVER_AVX 1
 // #define NEVER_SSE 1
 
diff --git a/src/miraculix/plink.h b/src/miraculix/plink.h
index 980e736f..58503779 100755
--- a/src/miraculix/plink.h
+++ b/src/miraculix/plink.h
@@ -36,7 +36,7 @@ vectorGeno_header(double, double, double, Plink);
 vectorGeno_header(LongDouble, LongDouble, LongDouble, Plink);
 sumGeno_header(Plink);
 
-sparseTGeno_header(OrigPlink);
+sparseTGeno_header(Plink);
 vectorGeno_header(double, double, double, PlinkMatrix256);
 
 void getPlinkMissings(Uchar* plink, SEXP SxI, basic_options *opt);
diff --git a/src/miraculix/plink256.cc b/src/miraculix/plink256.cc
index ecc75c54..ed54ee31 100755
--- a/src/miraculix/plink256.cc
+++ b/src/miraculix/plink256.cc
@@ -62,7 +62,7 @@ IMPORTANT FUNCTIONS IN 2bit256.cc
 
 
 vectorGeno_header(double, double, double, PlinkMatrix256) {
-  // calculates  v * t(geno)  for some geno matrix given by "code"
+  // calculates  t(v) * geno  for some geno matrix given by "code"
   // In :    V : snps x repetV
   //      code : snps x indiv
   // Out:  Ans : indiv x repetV
@@ -166,8 +166,6 @@ vectorGeno_header(double, double, double, PlinkMatrix256) {
   //  printf("bytesTV = %ld %ld\n", bytesTV, bytesTans);
   Doubles *TV = (Doubles*) MALLOC(bytesTV);
   Doubles *Tans = (Doubles*) MALLOC(bytesTans);
-  assert(TV != NULL);
-  assert(Tans != NULL);
  
 
 #define miniIndiv1 8
diff --git a/src/miraculix/plinkUint.cc b/src/miraculix/plinkUint.cc
index 93ed51ee..4e915d27 100755
--- a/src/miraculix/plinkUint.cc
+++ b/src/miraculix/plinkUint.cc
@@ -313,7 +313,7 @@ void getPlinkMissings(Uchar* plink, SEXP SxI, basic_options *opt) {
   if (snp##N + f + 1 <= snpsEnd) snps##N = snp##N[f];		\
   else {					\
     Uchar *S = (Uchar*) (snp##N + f);					\
-    Uchar TMP[TMP_LEN] = {						\
+    Uchar TMP[TMP_LEN] = { /* LittleEndian! */				\
       S + 0 < (Uchar*) snpsEnd ? S[0] : (Uchar) 0,			\
       S + 1 < (Uchar*) snpsEnd ? S[1] : (Uchar) 0,			\
       S + 2 < (Uchar*) snpsEnd ? S[2] : (Uchar) 0,			\
@@ -349,7 +349,8 @@ void getPlinkMissings(Uchar* plink, SEXP SxI, basic_options *opt) {
   iChunk+=miniIndivChunk;					\
   nonzeros-=miniIndivChunk
 
-sparseTGeno_start(OrigPlink)
+sparseTGeno_start(Plink)
+  // sparse * t(code)
   // NOTE: for better readability only, redefine nrow & ncol by their 
   //       typical meaning in (sparse) %*% t(code), namely snps and individuals
   const Long snps = nrow_code;       
@@ -367,7 +368,7 @@ sparseTGeno_start(OrigPlink)
     double *ans = Ans + iRchunk * ldAns;
     assert(sizeof(Ulong) == TMP_LEN);
 #ifdef DO_PARALLEL
-#pragma omp parallel for num_threads(cores) schedule(static,staticSize) ordered 
+#pragma omp parallel for num_threads(cores) // schedule(static) // schedule(static,staticSize) ordered
 #endif 
     for (int j=0; j<nIdx; j++) {
       int nonzeros = rowIdxB[j+1] - rowIdxB[j]; // CSR/CSR/Yale
diff --git a/src/miraculix/solve_rrfu.cc b/src/miraculix/solve_rrfu.cc
new file mode 100755
index 00000000..7cfb6ca1
--- /dev/null
+++ b/src/miraculix/solve_rrfu.cc
@@ -0,0 +1,2223 @@
+
+/*
+ Authors 
+ Martin Schlather, martin.schlather@uni-mannheim.de
+
+ Copyright (C) 2015 - 2017 Martin Schlather, Reinhard Furrer, Martin Kroll
+ Copyright (C) 2017 - 2020 Martin Schlather
+ Copyright (C) 2021 - 2022 Martin Schlather, Alexander Freudenberg
+ Copyright (C) 2022-2023 Martin Schlather 
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+
+#include "Basic_RFUlocal.h"
+#include "compatibility.lapack.h"
+#include "extern_RFU.h"
+#ifdef USEGPU
+#include "solve_gpu.h"
+#else
+SIMD_MISS(solve_61, gpu);
+#endif
+
+#if ! defined FCONE // only as RFU has errror that R V 3x is used
+  #if defined __GCC__
+    #warning FCONE not available
+  #endif 
+  #define FCONE
+#endif
+#ifndef LOCAL_ERRORSTRING
+#define LOCAL_ERRORSTRING
+#endif
+#ifdef WHICH_ERRORSTRING
+#undef WHICH_ERRORSTRING
+#define WHICH_ERRORSTRING pt->err_msg
+#endif
+#include "kleinkram.h"
+#include "zzz_RFU.h"
+
+//#include "xport_import_RFU.h"
+extern const char * solve[solveN];
+#define SCALAR(A,B,C) scalarX(A,B,C,NR)
+#define LINEAR(A,B,C,D) linearX(A,B,C,D, linear_avx)
+
+
+AVAILABLE_SIMD
+
+
+
+const char * InversionNames[nr_InversionMethods] = {
+  "cholesky", "svd", "eigen", "sparse",
+  "method undefined",
+  "qr", "lu", 
+  "no method left",
+  "GPU-cholesky",
+  "R chol implementation",
+  "direct formula",
+  "diagonal"};
+		
+
+#define KAHAN false // O P TIONS.installNrun.kahanCorrection
+
+#define C_MALLOC(INT, WHICH, N, TYPE)	{		\
+    INT _N_ = N;							\
+    if (pt->n_##WHICH < _N_) {						\
+      if (pt->n_##WHICH < 0) BUG;					\
+      FREE(pt->WHICH);							\
+      pt->n_##WHICH = _N_;						\
+      if ((pt->WHICH = (TYPE *) CALLOC(_N_, sizeof(TYPE))) == NULL)	\
+	return ERRORMEMORYALLOCATION;					\
+    } else {								\
+      assert( (_N_ > 0 && pt->WHICH != NULL) || _N_ == 0);		\
+      for (INT iii=0; iii<_N_; pt->WHICH[iii++] = 0);			\
+    }									\
+  }									\
+    TYPE VARIABLE_IS_NOT_USED *WHICH = pt->WHICH
+
+#define CMALLOC(WHICH, N, TYPE)  C_MALLOC(int, WHICH, N, TYPE)
+
+
+/*
+//  sqrtPosDef nutzt pt->U fuer das Ergebnis		
+#define FREEING_RESULT(WHICH)					\
+  assert(int VARIABLE_IS_UNUSED *_i = WHICH);		\
+  if (pt->WHICH != NULL && pt->WHICH != result) {	\
+    UNCONDFREE(pt->WHICH);				\
+    pt->n_##WHICH = 0;					\
+  }		
+*/				
+ 	
+
+double Determinant(double *M, int size, bool logarithm) {
+  Long sizeP1 = size + 1,
+    sizeSq = (Long) size * size;
+  if (logarithm) {
+    double tmp = 0.0;
+    for (Long i=0; i<sizeSq; i+=sizeP1) tmp += LOG(M[i]);
+    return tmp;
+  } 
+  double tmp = 1.0;
+  for (Long i=0; i<sizeSq; i+=sizeP1) tmp *= M[i];
+  return tmp;
+}
+
+double DeterminantLU(double *M, int size, bool logarithm, int *permut) {
+  Long sizeP1 = size + 1;
+    //    sizeSq = size * size;
+  //  for (int i=0;i<sizeSq; i++) {  if (i % size == 0)printf("\n"); printf("%f ", M[i]);  }printf("\n");
+  if (logarithm) {
+    double tmp = 0.0;
+    int vorz= 0;
+    for (int i=0; i<size; i++) {
+      tmp += LOG(FABS(M[i * sizeP1]));
+      vorz += (permut[i] != i+1) + (M[i * sizeP1] < 0);
+      //   printf("tmp %f %d %d %d\n", M[i * sizeP1], vorz , i, permut[i]);
+    }
+    //    printf("RFU det = %f %d\n", tmp, vorz);
+    if (vorz % 2)
+      RFERROR("calculation of log determinant need positive determinant");
+    return tmp;
+  } 
+  double tmp = 1.0;  
+  for (int i=0; i<size; i++) {
+    tmp *= M[i * sizeP1];
+    if (permut[i] != i+1) tmp = -tmp;
+  }
+  //  printf("RFU det = %f\n", tmp);
+  return tmp;
+}
+
+
+double cumProd(double *D, int size, bool logarithm) {
+  if (logarithm) {
+    double dummy = 0.0;
+    for (int i = 0; i < size; dummy += LOG(D[i++]));
+    return dummy;
+  }
+  double dummy = 1.0;
+  for (int i = 0; i < size; dummy *= D[i++]);
+  return dummy;
+}
+
+#define FREEING(WHICH) assert(int VARIABLE_IS_UNUSED *_i=WHICH); FREE0(x,WHICH)
+void solve_DELETE0(solve_storage *x) {
+  FREEING(iwork);
+
+  FREEING(pivotsparse);
+  FREEING(pivot_idx);
+  FREEING(xlnz);
+  FREEING(snode);
+  FREEING(xsuper);
+  FREEING(invp);
+  FREEING(cols);
+  FREEING(rows);
+  FREEING(lindx);
+  FREEING(xja);
+   
+  FREEING(main);
+  FREEING(rhs);
+  FREEING(w2);
+  FREEING(U);
+  FREEING(D);
+    
+  FREEING(w3);
+  FREEING(result);
+  FREE(x->to_be_deleted);
+}
+
+void solve_DELETE(solve_storage **S) {
+  solve_storage *x = *S;
+  if (x!=NULL) {
+    solve_DELETE0(*S);
+    UNCONDFREE(*S);
+  }
+}
+void solve_NULL(solve_storage* x) {
+  if (x == NULL) return;
+  MEMSET(x, 0, sizeof(solve_storage));  
+  x->nsuper = x->size = -1;
+  x->method = NoInversionMethod;
+  for (int i=0; i<SOLVE_METHODS; x->newMethods[i++] = NoInversionMethod);
+  x->actual_pivot = PIVOT_UNDEFINED;
+}
+
+double static inline det3(double *M, int size) {
+  double det;
+  switch(size){ // Abfrage nach Groesse der Matrix M + Berechnung der Determinante per Hand
+  case 1: det = M[0];
+    break;
+  case 2: det = M[0] * M[3] - M[1] * M[2];
+    break;
+  case 3: det = 
+      M[0] * (M[4] * M[8] - M[5] * M[7]) 
+      - M[1] * (M[3] * M[8] - M[5] * M[6]) 
+      + M[2] * (M[3] * M[7] - M[4] * M[6]); // Entwicklung nach 1. Spalte
+    break;
+  default : BUG;
+    break;
+  }
+  return det;
+}
+
+int logdet3(double det, bool posdef, double *logdet, bool logarithm) {
+  if (posdef && det < 0) return ERRORFAILED;
+  if (logdet != NULL) {
+    if (logarithm) {
+      if (det <= 0) return ERRORFAILED;
+      *logdet = LOG(det);
+    } else *logdet = det;
+  }
+  return NOERROR;
+}
+
+int solve3(double *M, int size, bool posdef,
+	   double *rhs, int rhs_cols,
+	   double *result, double *logdet, bool logarithm,
+	   solve_storage *pt	
+	   ){
+  
+  assert(size <= 3);
+  if (size <= 0) SERR0("matrix in 'solvePosDef' of non-positive size.");
+
+  double det = det3(M, size);
+  if (logdet3(det, posdef, logdet, logarithm) != NOERROR) return ERRORFAILED;
+
+  double detinv = 1.0 / det; // determinant of inverse of M
+  
+  switch(size){
+  case 1 : {// size of matrix == 1
+    if (rhs_cols == 0) result[0] = detinv;   
+    else for (int i=0; i<rhs_cols; i++) result[i] = rhs[i] * detinv;
+  }
+    break;
+  case 2 : { // size of matrix == 2
+    double a = M[0] * detinv,
+      d = M[3] * detinv;
+    if (rhs_cols == 0) {
+      result[0] = d;
+      result[1] = -M[1] * detinv;
+      result[2] = -M[2] * detinv;
+      result[3] = a;
+    } else { // rhs_cols != 0
+      double *p = rhs, *q = result;
+      if (M[1] != 0.0 || M[2] != 0.0) {
+	double 
+	  b = M[1] * detinv,
+	  c = M[2] * detinv;
+	for (int i=0; i<rhs_cols; i++, p+=2, q+=2) {
+	  double swap = d * p[0] - c * p[1];
+	  q[1] = a * p[1] - b * p[0];
+	  q[0] = swap;
+	}
+      } else {
+	for (int i=0; i<rhs_cols; i++, p+=2, q+=2) {
+	  double swap = d * p[0];
+	  q[1] = a * p[1];
+	  q[0] = swap;
+	}
+      }
+    }
+  }
+    break;
+  case 3 : {// size of matrix == 3   
+    double swap0 = detinv * (M[4] * M[8] - M[5] * M[7]),
+      swap1 = detinv * (M[5] * M[6] - M[3] * M[8]),
+      swap2 = detinv * (M[3] * M[7] - M[4] * M[6]),
+      swap3 = detinv * (M[2] * M[7] - M[1] * M[8]),
+      swap4 = detinv * (M[0] * M[8] - M[2] * M[6]),
+      swap5 = detinv * (M[1] * M[6] - M[0] * M[7]),
+      swap6 = detinv * (M[1] * M[5] - M[2] * M[4]),
+      swap7 = detinv * (M[2] * M[3] - M[0] * M[5]),
+      swap8 = detinv * (M[0] * M[4] - M[1] * M[3]);
+    if(rhs_cols == 0){ // invert matrix
+      result[0] = swap0;
+      result[1] = swap1;
+      result[2] = swap2;
+      result[3] = swap3;
+      result[4] = swap4;
+      result[5] = swap5;
+      result[6] = swap6;
+      result[7] = swap7;
+      result[8] = swap8;
+    } else { // solve system given by M and rhs
+      double *p = rhs, *q = result;
+      for (int i=0; i<rhs_cols; i++, p+=3, q+=3) {
+	double swapA = p[0] * swap0 + p[1] * swap3 + p[2] * swap6;
+	double swapB = p[0] * swap1 + p[1] * swap4 + p[2] * swap7;
+	q[2] = p[0] * swap2 + p[1] * swap5 + p[2] * swap8;
+	q[0] = swapA;
+	q[1] = swapB;
+      }
+    }
+  }
+    break;
+  default: BUG;
+  }
+  
+  return NOERROR;
+}
+
+int chol3(double *M, int size, double *res, solve_storage *pt){
+  // UNBEDINGT in sqrtRHS.cc auch aendern
+  assert(size <= 3);
+  if (size <= 0) SERR0("matrix in 'solvePosDef' not of positive size.");
+  //  if (M[0] < 0) return ERRORFAILED;
+  res[0] = SQRT(M[0]);
+  if (size == 1) return NOERROR;
+  res[1] = 0.0;
+  res[size] = res[0] > 0.0 ? M[size] / res[0] : 0.0;
+  double dummy = M[size + 1] - res[size] * res[size];
+  res[size + 1] = SQRT(MAX(0.0, dummy));
+  if (size == 2) return NOERROR;
+  res[2] = res[5] = 0.0;
+  res[6] = res[0] > 0.0 ? M[6] / res[0] : 0.0;
+  res[7] = res[4] > 0.0 ? (M[7] - res[3] * res[6]) / res[4] : 0.0;
+  dummy = M[8] - res[6] * res[6] - res[7] * res[7];
+  res[8] = SQRT(MAX(0.0, dummy));
+  return NOERROR;
+}
+
+
+void Sort(double *RESULT, int size, Long rhs_cols, int *pi, int *rank,
+	  double *dummy) {
+  if (size > MAXINT) BUG;
+  orderingInt(pi, (int) size, 1, rank);
+  int i=0;
+  Long totalRHS = size * rhs_cols;
+  while(i < size && i == rank[i]) i++;
+  while (i < size) {
+    int stored_i = i,
+      read = i;
+    double *p_write = NULL,
+      *p_read = RESULT + read;
+    for (Long  k=0; k<rhs_cols; k++) dummy[k] = p_read[k * size];
+    while (true) {
+      int write = read;
+      p_write = RESULT + write;
+      read = rank[write];
+      rank[write] = write;
+      if (read == stored_i) {
+	for (int k=0; k<rhs_cols; k++) p_write[k*size]=dummy[k];
+	break;
+      }
+      p_read = RESULT + read;
+      for (Long k=0; k<totalRHS; k+=size) p_write[k] = p_read[k];
+    }
+    while(i < size && i == rank[i]) i++;
+  }
+}
+
+
+// to do: fehlt: chol2solve(chol, x)
+
+
+void chol2inv(double *MPT, int size, int cores) {
+  Long sizeP1 = size + 1,
+    sizeSq = (Long) size * size;
+  int
+    NR = KAHAN ? SCALAR_KAHAN : SCALAR_AVX;
+
+  double *diagonal = (double *) MALLOC(sizeof(double) * (Long) size);
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) if (size > 60) schedule(dynamic, 20)
+#endif	    
+  for (Long k=0; k<size; k++) {
+    double *p_RESULT = MPT + k * sizeP1,
+      diagK = diagonal[k] = 1.0 / p_RESULT[0];
+    for (Long i=1; i<size - k; i++) {
+      double *pM = p_RESULT + i * size;
+      p_RESULT[i] = (-diagK * pM[0] - SCALAR(pM + 1, p_RESULT + 1, i -1))
+	/ pM[i];
+    }
+    // i == k
+  }
+  
+  int linear_avx = LINEAR_AVX * avxAvail;
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) if (size > 60) schedule(dynamic, 20)
+#endif
+  for (Long k=0; k<size; k++) {	      
+    double *p_RESULT = MPT + k * size;
+    for (Long i=size-1; i>k; i--) {
+      double *pM = MPT + i * size,
+	r = (p_RESULT[i] /= pM[i]);
+      diagonal[k] -= r *pM[k];
+      LINEAR(pM + k + 1, -r, i-k-1, p_RESULT + k + 1);
+      // for (int j=k+1; j<i; j++) p_RESULT[j] -= r * pM[j]; 
+    }
+    // i == k
+  }
+  
+  for (Long k=0; k<size; k++) {
+    double *pM = MPT + k * size;	         
+    pM[k] = diagonal[k] / pM[k];
+  }
+   
+  for (Long i2=0,i=0; i<size; i++, i2+=sizeP1) {
+    Long i3 = i2 + 1;
+    for (Long j = i2 + size; j<sizeSq; j+=size) 
+      MPT[j] = MPT[i3++];
+  }
+  FREE(diagonal);
+
+}
+
+
+
+
+
+int doPosDefIntern(double *M0, int size, bool posdef,
+		   double *rhs0, Long rhs_cols, double *result,
+		   double *logdet, int calculate, solve_storage *Pt,
+		   solve_options *sp, int VARIABLE_IS_NOT_USED cores
+		   ){
+  ASSERT_SOLVE(sp);
+  
+  // it is ok to have
+  // ==15170== Syscall param sched_setaffinity(mask) points to unaddressable byte(s)
+  // caused by gcc stuff
+
+  //  printf("doPosDef %ld %d %d rhs=%ld %d %ld %ld calc=%d %ld %ld\n",
+  //	 M0, size, posdef, rhs, rhs_cols, result, logdet, calculate, Pt,sp);
+  /*
+    M0: (in/out) a square matrix (symmetry is not checked) of size x size;
+    NOTE THAT THE CONTENTS OF M0 IS DESTROYED IFF NO RHS IS GIVEN
+    AND result IS NOT GIVEN.
+    In case rhs is not given, the inverse of M0 is returned 
+    In case of sqrtonly, M0 is expected to be a positive definite matrix
+    posdef (in): whether or not the matrix is positiv (semi)definite --
+    to some extend doPosDef can deal with non-positiv definite
+    functions
+
+    rhs (in/out) : right hand side of the equality with rhs_cols columns
+    NOTE THAT THE CONTENTS OF rhs WILL BE DESTROYED IF rhs IS GIVEN, BUT NOT
+    result  
+ 
+    the solution of the equality is returned in rhs
+    rhs_cols : number of colums of the matrix on the right hand side
+    result (out) : NULL or matrix of the size of the result (inverse matrix or 
+    of size of the matrix on the right hand side);see also 'M0' and 'rhs'
+    logdet (out): if not NULL the logarithm of the determinant is returned
+    pt (in/out) : working space. If NULL, internal working spaces are used.
+ 
+    A non-NULL value gives an advantage only if doPosDef is called
+    repeatedly. Then 
+    solve_storage *pt = (solve_storage*) m alloc(sizeof(solve_storage);
+    solve_NULL(pt);
+    prepares pt. Deletion is done only at the very end by
+    solve_DELETE(pt);
+    In meanwhile the working space is managed by doPosDef;
+    Sp (in): parameters. If NULL, the behaviour is as described in
+    the R manual for doPosDef.
+    The parameters are described in the package 'RandomFields'
+    under ?RFoptions
+	  
+  */
+
+  //  printf("entering\n");
+  // http://www.nag.com/numeric/fl/nagdoc_fl23/xhtml/F01/f01intro.xml#
+  
+  if (rhs_cols > MAXINT) BUG;
+  assert(NA_LOGICAL == INT_MIN && NA_LOGICAL == NA_INTEGER); // nur zur sicherheit, wegen usr_bool
+  //          eigentlich sollte usr_bool unabhaengig davon funktionieren
+  assert(calculate != DETERMINANT ||
+	 (logdet != NULL && result == NULL && rhs0 == NULL));
+  assert(calculate != MATRIXSQRT || (rhs0 == NULL && posdef));
+  assert((rhs_cols != 0) xor (rhs0 == NULL));
+
+  double *RESULT = result != NULL ? result : rhs_cols > 0 ? rhs0 : M0;
+  
+  // Pivot_Cholesky:
+  //	if (MPT == Morig || (rhs_cols > 0 && rhs == RESULT))
+  //	  CERR0("Pivoted cholesky cannot be performed on place! Either you are a programmer or you should contact the maintainer.");
+
+  // !MATRIXSQRT &&  rhs_cols > 0
+  // assert(rhs != RESULT);
+  la_modes la_mode = OPTIONS.installNrun.la_mode; 
+  if (size <= sp->tinysize) {
+    if (Pt != NULL) {
+      Pt->method = direct_formula;
+      Pt->size = size;
+    }
+    if (calculate == DETERMINANT)
+      return logdet3(det3(M0, size), posdef, logdet, sp->det_as_log);
+    else if (calculate  == MATRIXSQRT) return chol3(M0, size, RESULT, Pt);
+    else return solve3(M0, size, posdef, rhs0,
+		       (int) rhs_cols, RESULT, logdet,
+		       sp->det_as_log, Pt);
+  }
+
+  //  printf("size=%d %d %d sparse=%d\n", size, sp->pivot, PIVOT_AUTO, sp->sparse);
+  // ERR0("XXXX");
+  // printf("%d %d %d\n",  PIVOT_AUTO, direct_formula,Pt->method);
+
+  assert(SOLVE_METHODS >= 2);
+ 
+  //  printf("A\n");
+ 
+  //  printf("A1\n");
+  int  err = NOERROR;
+  Long VARIABLE_IS_NOT_USED spam_zaehler = 0; // if spam is not compiled
+  int nnzA = 0;
+  Long
+    sizeSq = (Long) size * size,
+    sizeRHS = (Long) size * rhs_cols;
+  int sizeP1 = size + 1;
+  //  printf("A1dd\n");
+#if defined compatibility_to_C_h
+   usr_bool sparse = False;
+#else
+    usr_bool sparse = sp->sparse;
+#endif
+  double spam_tol = sp->spam_tol;
+  //   printf("A2\n");
+  bool diag  = false,
+    useGPU = la_mode == LA_GPU &&
+    (calculate == SOLVE || calculate == DETERMINANT);
+ //   printf("A3\n");
+
+
+  //  printf("A %d %d size=%d %d %d \n",	 sparse,Nan ,size, useGPU, sp->n_spam_min[useGPU]);
+  if (sparse == Nan && (sparse = (usr_bool) (size > sp->spam_min_n[useGPU]))) {
+    //     printf("AB2\n");
+    double mean_diag = 0.0;
+    for (Long i=0; i<sizeSq; i += sizeP1) mean_diag += M0[i];
+    mean_diag /= (double) size;
+    spam_tol *= mean_diag;
+    // printf("AC2\n");
+    bool random_sample = sizeSq >= sp->spam_sample_n * 3;
+    if (random_sample) {
+      //  printf("A2E\n");
+      double 
+	thr = sp->spam_sample_n * (1.0 - sp->spam_min_p[useGPU]);
+      Long
+	threshold = (Long) (thr + SQRT(thr) * 3),
+	notZero = 0;
+      for (Long i=0; i<sp->spam_sample_n; i++) {
+	//	printf("A2 %d %d\n", i , sp->spam_sample_n);
+	if ((notZero += !(FABS(M0[(i * sp->spam_factor) % sizeSq]) <=
+			  spam_tol)) >= threshold){
+	  sparse = False;
+	  break;
+	}
+      }
+      if (PL >= PL_FCTN_DETAILS) {
+	PRINTF("random sampling: sparse=%d\n",
+	       sparse == Nan ? NA_INTEGER : (int) sparse);
+      }
+    }
+    ///    printf("A2EX %d %d\n", random_sample, sparse == True);
+    if (!random_sample || sparse == True) {
+      //   printf("AdC2\n");
+      Long diag_nnzA = 0;
+      //#ifdef DO_PARALLEL
+      //#pragma omp parallel for num_threads(cores) schedule(dynamic,10) reduction(+:nnzA,diag_nnzA)
+      //#endif
+      for (Long i=0; i<size; i++) {
+	//	printf("AC2 %d %d\n",i ,size);
+	Long end = i * sizeP1;
+	Long j;
+	// Achtung!!: !(FABS(M0[j++]) <= spam_tol) != FABS(M0[j++]) > spam_tol
+	//             bei NA/NaN
+	for (j=i * size; j<end; nnzA += !(FABS(M0[j++]) <= spam_tol));
+	diag_nnzA += !(FABS(M0[j++]) <= spam_tol);
+	end = (i+1L) * size;
+	if (!posdef) for (; j<end; nnzA += !(FABS(M0[j++]) <= spam_tol));
+      }
+      //     printf("AddC2\n");
+      diag = (nnzA == 0);
+      if (posdef) nnzA *= 2;
+      stopIfNotUInt(diag_nnzA);
+      nnzA += (int) diag_nnzA;
+      sparse = (usr_bool) ((double) nnzA <=
+			   (double) sizeSq * (1.0 - sp->spam_min_p[useGPU]));
+      spam_zaehler = nnzA + 1;
+      //    printf("ddAdC2\n");
+     if (PL >= PL_DETAILSUSER) {
+	if (diag) { PRINTF("diagonal matrix detected\n"); }
+	else if (sparse == True) {
+	  PRINTF("sparse matrix detected (%3.2f%% zeros)\n", 
+		 100.0 * (1.0 - (double) nnzA / (double) sizeSq));
+	} else {
+	  PRINTF("full matrix detected (%3.2f%% nonzeros)\n", 
+		 100.0 * (double) nnzA / (double) sizeSq); }
+      }
+    }
+  } else {
+    //   printf("ttAdC2\n");
+    diag = true;
+    for (Long i=0; i<size && diag; i++) {
+      //      printf("AdC2 %d %d %d\n", i, size, diag);
+      Long j,
+	end = i * sizeP1;
+      for (j=i * size; j<end; j++) {
+	//	printf("(%d %d %10g %d)\n", i, j, M0[j], size);
+	if (!(FABS(M0[j]) <= spam_tol)) {
+	  diag = false;
+	  break;
+	}
+      }
+      if (!diag) break;
+      j++;
+      end = (i+1) * size;
+      if (!posdef) {
+	for (; j<end; j++) {
+	  if (!(FABS(M0[j]) <= spam_tol)) {
+	    diag = false;
+	    break;
+	  }
+	}
+      }
+    }
+  }
+
+
+  solve_storage *pt;
+  if (Pt != NULL) {
+    pt = Pt; 
+  } else {
+    pt = (solve_storage*) MALLOC(sizeof(solve_storage));
+    solve_NULL(pt);    
+  }
+  InversionMethod *Meth = pt->newMethods;
+  pt->method = NoFurtherInversionMethod;
+  pt->size = size;
+
+  
+  //  printf("BA\n");
+  if (diag) {
+    pt->method = Diagonal;
+    if (PL>=PL_STRUCTURE) { PRINTF("dealing with diagonal matrix\n"); }
+    if (logdet != NULL) {
+      *logdet = Determinant(M0, size, sp->det_as_log);
+      if (calculate == DETERMINANT) {
+	err = NOERROR;
+	goto ErrorHandling;
+      }
+    }
+    if (rhs_cols == 0) {
+      MEMCOPY(RESULT, M0, sizeSq * sizeof(*RESULT));
+      if (calculate == MATRIXSQRT) {
+	for (Long i=0; i<sizeSq; i += sizeP1) {
+	  RESULT[i] = M0[i] > 0.0 ? SQRT(M0[i]) : 0.0;	
+	}
+      } else {
+	for (Long i=0; i<sizeSq; i += sizeP1) 
+	  RESULT[i] = M0[i] <= 0.0 ? 0.0 : 1.0 / M0[i];
+      }
+    } else {
+      CMALLOC(main, size, double);
+      for (Long i=0; i<size; i++) {
+	Long idx = i * sizeP1;
+	main[i] = M0[idx] == 0.0 ? 0.0 : 1.0 / M0[idx];
+      }
+      Long j;
+      for (Long k=j=0; j<rhs_cols; j++)
+	for (Long i=0; i<size; i++, k++) RESULT[k] = rhs0[k] * main[i];
+    }
+    
+    err = NOERROR;
+    goto ErrorHandling;
+  }
+
+  
+  // size of matrix at least 4 x 4, and not diagonal
+  int to, from;
+  from = 0;
+  if ((to = sparse == True)) Meth[0] = Sparse;
+  while (from < SOLVE_METHODS &&
+	 to < SOLVE_METHODS &&
+	 sp->Methods[from] != NoFurtherInversionMethod &&
+	 sp->Methods[from] != NoInversionMethod) {
+    if (sp->Methods[from] == Sparse && sparse == True) from++;
+    else Meth[to++] = sp->Methods[from++];
+  }
+
+  //  printf("from = %d (%d %d) [%d %d %d] sparse=%d %d\n", from, sp->Methods[0], sp->Methods[1], Meth[0],  Meth[1], Meth[2], sparse == True, Sparse);
+  
+  if (from == 0) { // user did not give any method
+    if (posdef) {
+      if (to < SOLVE_METHODS) {
+	Meth[to++] = useGPU ? GPUcholesky : Cholesky;
+	if (to < SOLVE_METHODS) {
+	  Meth[to++] = sp->pivot_mode != PIVOT_NONE && useGPU ? Cholesky : Eigen;
+	}
+      }
+    } else {
+      Meth[to++] = LU;
+    }
+  } else {
+    if (useGPU) 
+      for (int i=0; i<SOLVE_METHODS; i++)
+	if (Meth[i] == Cholesky || Meth[i] == GPUcholesky){
+	  Meth[i]=GPUcholesky;
+	  break; // so "Cholesky" can be given several times, only
+	  // the very first will be changed to GPUcholesky.
+	  // if GPU is already given do not change any given Cholesky afterwards
+	}
+  }
+
+  //  printf("to = %d (%d %d) %d %d\n", to, Meth[0], Meth[1], from, sparse);  assert(Meth[0] != 4);
+  
+  for (; to<SOLVE_METHODS; Meth[to++]=NoFurtherInversionMethod);//save
+
+  
+  
+  // cholesky, QR, SVD, Eigen, LU always destroy original matrix M
+  int first_not_reading_M0;
+  double *MPT, *RHS;
+  MPT = M0;// pointer of M matrix, die zerstoert wird
+  RHS = rhs0; 
+  first_not_reading_M0 = 0;
+
+  if (rhs_cols == 0 && result != NULL) MPT = result;
+  else {
+    first_not_reading_M0 = sparse + (Meth[sparse] == GPUcholesky);
+    // printf("first_not_reading_M0 %d %d\n", first_not_reading_M0, Meth[first_not_reading_M0] != Meth[first_not_reading_M0 - 1] &&
+    //	 Meth[first_not_reading_M0] != NoFurtherInversionMethod);
+    
+    if (first_not_reading_M0 == 0 ||
+	(Meth[first_not_reading_M0] != Meth[first_not_reading_M0 - 1] &&
+	 Meth[first_not_reading_M0] != NoFurtherInversionMethod)) {
+      //printf("rhs_cols +%d\n", rhs_cols);
+      if (rhs_cols > 0
+	  ||
+	  (SOLVE_METHODS > first_not_reading_M0 + 1 &&
+	   Meth[first_not_reading_M0 + 1] != Meth[first_not_reading_M0] &&
+	   Meth[first_not_reading_M0 + 1] != NoFurtherInversionMethod)
+	  ||
+	  (Meth[first_not_reading_M0] == SVD && sp->svd_tol > 0.0 &&
+	   calculate != SOLVE)
+	  ) { // at least two different Methods in the list
+	C_MALLOC(Long, main, sizeSq, double);//to pt->main, main local variable
+	MPT = pt->main;
+	if (rhs_cols > 0) {
+	  C_MALLOC(Long, rhs, sizeRHS, double);//to pt->main, main local variab
+	  RHS = pt->rhs;
+	}
+      }
+    }
+  }
+
+  
+  // printf("AFF\n");
+
+  errorstring_type ErrStr;
+  STRCPY(ErrStr, "");
+
+  //  printf("Meth=%d %d Chol=%d %d posdef=%d\n", Meth[0], Meth[1], Cholesky, SOLVE_METHODS, posdef);
+
+  //  for (int i=0; i<size; i++) { printf("\n");
+  //   for (int j=0; j<size; j++)  printf("%e ", M0[i * size + j]);
+  // } printf("\n");
+
+ 
+  int proposed_pivot;
+  proposed_pivot = sp->pivot_mode;
+  for (int m=0; m<SOLVE_METHODS && (m==0 || Meth[m] != Meth[m-1]); m++) {
+    pt->method = Meth[m];
+    if (pt->method == Cholesky && size > OPTIONS.installNrun.LaMaxTakeIntern) {
+      pt->method = calculate == DETERMINANT ? LU : Rcholesky;
+    }
+    if (pt->method < 0) break;
+    if (calculate != SOLVE) {
+      if (pt->method == NoInversionMethod && m<=sparse) BUG;
+      if (pt->method == NoFurtherInversionMethod) break;
+      if (PL>=PL_STRUCTURE) { 
+	PRINTF("method to calculate the square root : %s\n", 
+	       InversionNames[pt->method]);
+      }
+    } else {
+      if (PL>=PL_STRUCTURE) { 
+	PRINTF("method to calculate the inverse : %s\n",
+	       InversionNames[pt->method]);
+      }
+    }
+     
+    if (MPT != M0 && m >= first_not_reading_M0)
+      MEMCOPY(MPT, M0,  sizeSq * sizeof(*MPT));
+
+    if (RHS != rhs0) MEMCOPY(RHS, rhs0,  (Long) sizeRHS * sizeof(*RHS));
+ 
+    switch(pt->method) {
+    case GPUcholesky :
+      if (size > 28000)
+	GERR0("due to a bug at NVIDIA the maximum size is currently limited to 28000.");
+      
+      if (!posdef) CERR0("Cholesky needs positive definite matrix");
+#ifdef USEGPU
+      if (proposed_pivot > PIVOT_AUTO)
+	GERR0("cholesky decomposition on GPU does not allow for pivoting");
+      pt->actual_pivot = PIVOT_NONE;
+      {
+	double LD,
+	  *LogDet = logdet == NULL ? &LD : logdet;
+	err = // int; see errors_messages.h for values 0...4
+	  cholGPU(true, // bool : in : says that values must be copied
+		  // so in miraculix, RFU_AlexChol(false, ....)
+		  // can be called
+		  M0,// in: this matrix is copied by Alex because
+		  //               of value 'true' in the first argument,
+		  //               so contents never distroyed by Alex
+		  size, // in:  size of the matrix
+		  rhs0, //in: if NULL the inverse of M is calculated;
+		  //  rhs is copied by Alex  because of value 'true'
+		  //  in the first argument,, so never distroyed by Alex
+		  rhs_cols, // in: number of columns on the right hand side
+		  LogDet, // out : logarithm of the determinant of
+		  //               the sqare root(!) of the matrix M
+		  RESULT); // out: a pointer to the result whether or
+	// not rhs is given
+	if (err != NOERROR) {
+	  if (proposed_pivot == PIVOT_AUTO) proposed_pivot = PIVOT_DO;
+	  continue;
+	}
+	if (logdet != NULL) {
+	  *logdet *= 2;
+	  if (!sp->det_as_log) *logdet = EXP(*logdet);
+	  if (calculate == DETERMINANT) {
+	    err = NOERROR;
+	    goto ErrorHandling;
+	  }
+	}
+      }
+#else
+      //     err = NOERROR;
+      BUG;
+#endif
+      break;
+    case Rcholesky : {
+      //      printf("chol (R)\n");
+      if (calculate == SOLVE) {
+	if (rhs_cols > MAXINT) BUG;
+        int n_rhs = (int) rhs_cols;
+	double *mm = NULL;
+	CMALLOC(xja, size, int);
+	if (rhs_cols == 0) {
+	  //	 printf("hier %ld %ld %ld res=%ld %ld %ld\n", RESULT, MPT, M0, result, rhs0, RHS);
+	  if (RESULT == MPT){
+	    Long bytes = sizeSq * sizeof(*RESULT);
+	    mm = (double *) MALLOC(bytes);
+	    MEMCOPY(mm, M0, bytes);
+	  }
+	  
+	  MEMSET(RESULT, 0, sizeof(*RESULT) * sizeSq);
+	  for (Long i=0; i<sizeSq; i+=sizeP1) RESULT[i] = 1.0;
+	  if (size > MAXINT) BUG;
+ 	  n_rhs = size;
+	  
+	  /*
+	    
+	    for (Long i=0; i<size; i++)  {
+	    for (Long j=0; j<size; j++) {
+	    //printf("%f ", RESULT[i + j * size]); 
+	    }
+//	    printf("\n");
+	    }
+//	    printf("\n M\n");
+	    for (Long i=0; i<size; i++)  {
+	    for (Long j=0; j<size; j++) {
+//	    printf("%f ", MPT[i + j * size]);
+	    }
+//	    printf("\n");
+	    }
+	    
+	  */
+	  
+	} else {
+	  if (RHS != RESULT)
+	    MEMCOPY(RESULT, RHS, sizeof(*RESULT) * (Long) size * rhs_cols);
+	}
+	F77dgesv(&size, &n_rhs, mm == NULL ? MPT : mm, &size, xja, RESULT,
+		 &size, &err);
+	if (logdet != NULL)
+	  *logdet = DeterminantLU(mm == NULL ? MPT : mm, size, sp->det_as_log,
+				  xja);
+	FREE(mm); // NOTE: return / errrors only afterwards!	
+	if (err != NOERROR) GERR0("LU algorithm failed.");
+      } else if (calculate == MATRIXSQRT) {
+	if (MPT != RESULT) MEMCOPY(RESULT, MPT, sizeSq * sizeof(*RESULT));
+        F77dpotrf("U", &size, RESULT, &size, &err
+#ifdef USE_FC_LEN_T	  
+		  FCONE
+#endif		  
+		  );
+	if (logdet != NULL) {
+	  Determinant(RESULT, size, sp->det_as_log);
+	  if (sp->det_as_log) *logdet *=2; else *logdet *= *logdet;
+	}
+	Long sizeM1 = size - 1;
+	for (Long i=0; i<sizeM1; i++)
+	  MEMSET(RESULT + i * sizeP1 + 1, 0,
+		 sizeof(*RESULT) * ((Long) size - i - 1));
+	// inversion:
+	// F77dpotri("U", &sz, REAL(ans), &sz, &info FCONE);
+      } else BUG;
+      if (err != NOERROR) {
+	CERR1("algorithm failed (err=%d). Consider 'RFoptions(la_mode=\"intern\", pivot=PIVOT_AUTO)'", err);
+      }
+      break;
+    }
+    case Cholesky : {
+      //      printf("chol\n");
+      //
+#ifdef DO_PARALLEL
+      //   printf("chol (own), %d cores\n",  cores);
+#else
+      //   printf("chol (own), single core\n");
+#endif
+#define C_GERR0(X,G) {STRCPY(ErrStr, X); FERR0(X); err = ERRORM; goto G;}
+#define C_GERR1(X,Y,G) {SPRINTF(ErrStr,X,Y); FERR0(ErrStr);err = ERRORM; goto G;}
+#define C_GERR2(X,Y,Z,G){SPRINTF(ErrStr,X,Y,Z);FERR0(ErrStr);err=ERRORM; goto G;}
+#define C_GERR3(X,Y,Z,A,G) {SPRINTF(ErrStr,X,Y,Z,A); FERR0(ErrStr);err = ERRORM; goto G;}
+      int NR = KAHAN ? SCALAR_KAHAN : SCALAR_AVX,
+	linear_avx = LINEAR_AVX * avxAvail;
+      if (size > sp->max_chol) {
+	CERR2("Matrix  is too large for Cholesky decomposition. Maximum ist currently a %d x %d matrix. Increase 'max_chol' in 'RFoption' if necessary.",
+	      sp->max_chol, sp->max_chol);
+      }
+
+      ///      printf("size = %d %d %d\n", size, rhs_cols, size > rhs_cols ? size : rhs_cols);
+      
+      C_MALLOC(Long, D, size > rhs_cols ? size : rhs_cols, double);
+      for (Long i=0; i<size; i++) D[i] = MPT[sizeP1 * i];
+      
+      pt->actual_pivot = PIVOT_UNDEFINED;
+   
+      if (proposed_pivot == PIVOT_NONE || proposed_pivot == PIVOT_AUTO) {
+
+	// cmp for instance http://stackoverflow.com/questions/22479258/cholesky-decomposition-with-openmp
+
+	// obere und untere dreiecksmatrix wird gelesen und obere geschrieben
+	err = NOERROR;
+	pt->actual_pivot = PIVOT_NONE;
+	{
+	  double *A = MPT;
+	  for (Long i=0; i<size; i++, A += size) {
+	    double sclr = SCALAR(A, A, i);
+	    if (A[i] <= sclr) {
+	      if (proposed_pivot == PIVOT_NONE)
+		C_GERR2("Got %10e as %d-th eigenvalue.",
+			A[i] - sclr, (int) i, ERR_CHOL)
+		else C_GERR2("Got %10e as %d-th eigenvalue.",
+			     A[i] - sclr, (int) i, Pivot_Cholesky);
+	      break;
+	    }
+	    A[i] = SQRT(A[i] - sclr);
+
+  
+	    //	  double invsum = 1.0 / A[i];
+	    double sum = A[i];
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) if (MULTIMINSIZE(size - i)) schedule(dynamic, 8) 
+#endif
+	    //	    for (double *B=MPT + (i+1) * size; B<endB; B+=size) {
+	    for (Long j = i + 1; j < size; j++) {
+	      double *B = MPT + j * size;
+	      B[i] = (B[i] - SCALAR(A, B, i)) / sum;
+	    }
+	  }
+	}
+      
+	if (err == NOERROR) {
+	  if (calculate == MATRIXSQRT) {
+	    int deltaend = size - 1;
+	    double *end = MPT + sizeSq;
+	    for (double *p=MPT + 1; p<end; p+=sizeP1, deltaend--)
+	      FILL_IN(p, deltaend, 0.0);	    
+	  } else {
+	    if (logdet != NULL) {
+	      *logdet = Determinant(MPT, size, sp->det_as_log);
+	      if (sp->det_as_log) *logdet *=2; else *logdet *= *logdet;
+	      if (calculate == DETERMINANT) {
+		err = NOERROR;
+		goto ErrorHandling;
+	      }
+	    }
+
+	    if (rhs_cols == 0) chol2inv(MPT, size, cores);
+	    else { // rhs_cols > 0
+	      //Long totalRHS = size * rhs_cols;
+	      //if (result!=NULL) MEMCOPY(RESULT, rhs, sizeof(*RESULT)*totalRHS);
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) if (rhs_cols > 30) schedule(static)
+#endif	      
+	      for (Long k=0; k<rhs_cols; k++) {
+		double *p_RESULT = RESULT + k * size,
+		  *p_rhs = RHS + k * size;
+		for (Long i=0; i<size; i++) {
+		  double *pM = MPT + i * size;
+		  p_RESULT[i] = (p_rhs[i] - SCALAR(pM, p_RESULT, i)) / pM[i];
+		}
+	      }
+	      
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) if (rhs_cols > 30) schedule(static)
+#endif	      
+	      for (Long k=0; k<rhs_cols; k++) {	      
+		double *p_RESULT = RESULT + k * size;
+		for (Long i=size-1; i>=0; i--) {
+		  double *pM = MPT + i * size,
+		    r = (p_RESULT[i] /= pM[i]);
+		  LINEAR(pM, -r, i, p_RESULT);
+		  // for (Long j=0; j<i; j++)  p_RESULT[j] -= r * pM[j];
+		}
+	      }
+	    }
+	  } // not sqrt only
+	} // err == NOERROR
+      } else err = ERRORFAILED;
+      
+    
+      Pivot_Cholesky:
+      //      printf("Err pivot %d %s %d\n", err, ErrStr, proposed_pivot );
+      if (err != NOERROR && proposed_pivot != PIVOT_NONE) {
+	if (PL > PL_DETAILS) { PRINTF("trying pivoting\n"); }
+	int actual_size = NA_INTEGER;
+	// code according to Helmut Harbrecht,Michael Peters,Reinhold Schneider
+	// talk: The pivoted Cholesky decomposition and its application to
+	//       stochastic PDEs
+	// ebenso: untere dreiecksmatrix wird gelesen; obere geschrieben
+	if (pt->actual_pivot == PIVOT_NONE) {
+	  // wiederherstellung der Diagonalen und der unteren dreiecksmatrix
+	  for (Long i=0; i<size; i++) MPT[sizeP1 * i] = D[i];
+	}
+	int *pi;
+	if (proposed_pivot == PIVOT_DO || proposed_pivot == PIVOT_AUTO) {
+ 	  FREE(pt->pivot_idx); // ALWAYS FREE IT!!! cp Chol(S E X P M)
+	  pt->pivot_idx = (int*) MALLOC((Long) size * sizeof(int));
+	  pt->n_pivot_idx = size;
+	  pt->actual_pivot = PIVOT_DO;
+	  for (int i=0; i<size; i++) pt->pivot_idx[i] = i;
+	  pt->actual_size = actual_size = size;
+	  pi = pt->pivot_idx;
+	} else { // PIVOT_IDX 
+	  if (sp->n_pivot_idx < size || sp->actual_size > size) {
+	    //	    printf("XA, %d %d %d\n", sp->n_pivot_idx , size, sp->actual_size);
+	    CERR0("pivot_idx does not have the correct length.\nSet 'RFoption(pivot_idx=, pivot_actual_size=)' to the attributes of a\npivoted Cholesky decomposition.");
+	  }
+	  actual_size = pt->actual_size = sp->actual_size;
+	  if (actual_size > size) BUG;
+	  FREE(pt->pivot_idx);
+	  Long bytes = (Long) size * sizeof(*(pt->pivot_idx));
+	  pt->pivot_idx = (int*) MALLOC(bytes);
+	  MEMCOPY(pt->pivot_idx, sp->pivot_idx, bytes);
+	  pt->actual_pivot = PIVOT_IDX;
+	  pt->n_pivot_idx = sp->n_pivot_idx;
+	  pi = sp->pivot_idx;
+	}
+
+	err = NOERROR;
+	//	printf("hier\n");
+	double // *rhs = rhs,
+	  //	  *M00 = M0,
+	  rel_thres = 0,
+	  max_deviation = sp->max_deviation, //  1e-10,
+	  max_reldeviation = sp->max_reldeviation; //  1e-10,
+
+	//printf("MTP %d %d %d\n", MPT == M0, rhs_cols,  RHS == RESULT);
+	if (MPT == M0 || (rhs_cols > 0 && RHS == RESULT))
+	  CERR0("Pivoted cholesky cannot be performed on place! Either you are a programmer or you should contact the maintainer.");
+	/* 
+	if (MPT == M0) {
+	  CMALLOC(main, sizeSq, double);
+	  MEMCOPY(main, M0, sizeSq * sizeof(*main));
+	  M00 = main;	  
+	}
+	if (rhs_cols > 0 && rhs == RESULT) {
+	  Long totalRHS = (Long) size * rhs_cols;
+	  CMALLOC(U, totalRHS, double);
+	  MEMCOPY(U, rhs, totalRHS * sizeof(*U));
+	  RHS = U;
+	  }*/
+	for (Long q=0; q<actual_size; q++) {
+	  if (pt->actual_pivot == PIVOT_DO) {
+	    double 
+	      max = RF_NEGINF,
+	      deviation = 0.0;
+	    Long k,
+	      argmax = NA_INTEGER;	    
+	    for (k=q; k<size; k++) {
+	      double dummy = D[pi[k]];
+	      // if (D[pi[k]] < 0)
+
+	      //printf("k=%d %10e %10e\n", k, du1mmy, -1e-15 * size * size);
+	      if (dummy < -1e-4 * sp->pivot_relerror * (double) sizeSq){
+		C_GERR1("matrix not positive definite or increase 'pivot_relerror' by at least factor %10g.", dummy * -1e4 / (double) sizeSq, ERR_CHOL);
+	      }
+	      deviation += dummy;
+	      if (max < dummy) {
+		max = dummy;
+		argmax = k;		    
+	      }
+	    }
+	    
+	    double dev = rel_thres * max_reldeviation;
+	    if (deviation <= max_deviation || (q > 0 && deviation <= dev) ) {
+	      if (q > MAXINT) BUG;
+	      actual_size = pt->actual_size = (int) q;
+	      if (sp->pivot_check != False) {
+		double largest = 0;
+		for (Long i=q; i<size; i++) {
+		  double *mpt = MPT + pi[i] * (Long) size;
+		  for (Long j=q; j<=i; j++) {
+		    double absm = FABS(mpt[j]);
+		    largest = absm > largest ? absm : largest;
+		    // if(absm == largest || absm > 5) printf("%10e %d %d; %d\n", absm, i, j, size);
+		  }
+		}
+		if (largest > max_deviation || (q > 0 && largest > dev)) {
+		  char msg[500];
+		  SPRINTF(msg, "Matrix has a numerically zero, leading value at the %d-th pivoted line, but the largest deviation %10e from zero in the rest of the matrix is greater than the tolerance %10e. %.50s.",
+			  (int) q,
+			  largest,
+			  MAX(max_deviation, dev),
+			  sp->pivot_check == True
+			  ? "If you are sure that the matrix is semi-definite, set 'RFoptions(pivot_check=NA)' or 'RFoptions(pivot_check=True)'"
+			  : "The result can be imprecise");
+		  if (sp->pivot_check == True) C_GERR0(msg, ERR_CHOL)
+		    else WARN0(msg);
+		}
+	      }
+	      break;
+	    }
+	    rel_thres += D[pi[q]];
+	    int dummy = pi[q];
+	    pi[q] = pi[argmax];
+	    pi[argmax] = dummy;
+	  }
+	  
+	  
+	  Long pqq = pi[q],
+	    col_q = pqq * size;
+
+	  
+	  if (D[pqq] < 0) {
+	    C_GERR1("Negative leading value found at the %d-th pivoted line.",
+		    (int) q, ERR_CHOL);
+	  }
+	  double lqpq = MPT[q + col_q] = SQRT(D[pqq]);	    
+
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) if (MULTIMINSIZE(size - q)) schedule(dynamic, 8) 
+#endif
+	  for (Long i=q+1; i<size; i++) {
+	    Long pii = pi[i];
+	    Long col_i = pii * size;
+	    assert(pii != pqq);
+	    double scalar = SCALAR(MPT + col_q, MPT + col_i, q);
+	    MPT[q + col_i] = (M0[pqq + col_i] - scalar) / lqpq;
+	    D[pii] -=  MPT[q + col_i] *  MPT[q + col_i];
+	    // in Harbrecht: lqpq * MPT[q + col_i];
+	  }
+	  
+	  /*	  if (!true) {
+		  for (Long k=0; k<size; k++) {  
+		  for (Long j=0; j<size; j++)
+		  p rintf("%10g ", MPT[size * j + k]);
+		  p rintf("\n");
+		  } p rintf("\n");
+		  }
+	  */
+	  
+	} // for q
+
+	
+	if (err == NOERROR) {
+	  if (calculate == MATRIXSQRT) {
+	    Long i = 0;
+	    for ( ; i<actual_size; i++) {
+	      FILL_IN(MPT + i + 1 + size * pi[i], size - 1 - i, 0.0);
+	    }
+	    for ( ; i<size; i++) {
+	      FILL_IN(MPT + actual_size + (Long) size*pi[i], size - actual_size, 0.0);
+	    }
+	    
+	  } else { // ! MATRIXSQRT
+	    if (logdet != NULL) {
+	      Long N = size;
+	      if (pt->actual_pivot == PIVOT_DO) {
+		N = actual_size;
+	      }
+	      if (sp->det_as_log) {
+		if (N < size && !sp->pivot_partialdet) *logdet = RF_NEGINF;
+		else {
+		  double logD = 0.0;
+		  for (Long i=0; i < N; i++)
+		    logD += LOG(MPT[i + pi[i] * (Long) size]);
+		  *logdet = logD * 2;
+		}
+	      } else {
+		if (N < size && !sp->pivot_partialdet) *logdet = 0;
+		else {
+		  double logD = 1.0;
+		  for (Long i=0; i < N; i++)
+		    logD *= MPT[i + pi[i] * (Long) size];
+		  *logdet = logD * logD;
+		}
+	      }
+	      if (calculate == DETERMINANT) {
+		err = NOERROR;
+		goto ErrorHandling;
+	      }
+	    }
+	    
+	    //////////////////////////////////////////////////
+	    //////////////////////////////////////////////////
+
+	    if (rhs_cols == 0) {
+ 	      if (actual_size < size) 
+		GERR0("Matrix not definite. Try ")
+	      
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) if (size > 60) schedule(dynamic, 20)
+#endif
+		  for (Long k=0 ; k<actual_size; k++) {
+		    double *p_RESULT = MPT + pi[k] * (Long) size + k,
+		      diagK = D[k] = 1.0 / p_RESULT[0];
+		    for (Long i=1; i<size - k; i++) {
+		      double *pM = MPT + k + pi[k + i] * (Long) size;
+		      p_RESULT[i] = (-diagK * pM[0]
+				     -SCALAR(pM + 1, p_RESULT + 1, i -1)) / pM[i];
+		    }
+		  }     
+	      
+	      
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) if (size > 60) schedule(dynamic, 20)
+#endif	    
+	      for (Long k=0; k<size; k++) {	      
+		double *p_RESULT = MPT + pi[k] * (Long) size;
+		for (Long i=actual_size-1; i>k; i--) {
+		  double *pM = MPT + pi[i] * (Long) size,		    
+		    r = (p_RESULT[i] /= pM[i]);
+		  D[k] -= r * pM[k];
+		  LINEAR(pM + k + 1, -r, i-k-1, p_RESULT + k + 1);
+		  // for (Long j=k+1; j<i; j++) p_RESULT[j] -= r * pM[j]; 
+		}
+		// i == k
+	      }
+	      
+	      for (Long k=0; k<size; k++) {	      
+		double *p_RESULT = MPT + pi[k] * (Long) size;
+		for (Long q=0; q<k; p_RESULT[q++] = RF_INF);
+		//	for (Long q=actual_size; q<size; p_RESULT[q++] = RF_NA);
+	      }
+	      
+	      for (Long k=0; k<actual_size; k++) {
+		double *pM = MPT + pi[k] * (Long) size;	         
+		pM[k] = D[k] / pM[k];
+	      }
+	      
+	      CMALLOC(xja, size, int);
+	      Sort(RESULT, size, size, pi, xja, D);
+	      for (Long i=0; i<size; i++) {
+		for (Long j=i+1; j<size; j++) {
+		  Long idx = i + j * size;
+		  if (MPT[idx] == RF_INF) MPT[idx] = MPT[j + i * size];
+		  else MPT[j + i * size] = MPT[idx];
+		}
+	      }
+	      
+
+	      //////////////////////////////////////////////////
+	      //////////////////////////////////////////////////
+	      
+	    } else { // rhs_cols > 0
+	      // if (rhs0 == RESULT) {
+	      //	/* crash(); */
+	      //	#pragma GCC diagnostic push
+	      //#pragma GCC diagnostic ignored "-Wuninitialized"
+	      // Long i; PRINTF("%d\n", i);char m[1];m[i] = m[i-9] + 4; if (m[0]) i++; else i--; PRINTF("%s\n", m); // not MEMCOPY
+	      //#pragma GCC diagnostic pop
+  //  int *x = (int*) MALLOC(1000000);  f ree(x);  f ree(x); x[100] = 100;
+	      //    }
+	      //  printf("%ld %ld %ld\n", rhs0 ,  RESULT, RHS);
+	      //assert(rhs0 != RESULT); 
+	      // assert(RHS != RESULT); 
+	      double eps = D[0] * sp->pivot_relerror;
+			      
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) if (rhs_cols > 30) schedule(static)
+#endif
+	      for (Long k=0; k<rhs_cols; k++) {
+		double *p_RESULT = RESULT + k * size,
+		  *p_rhs = RHS + k * size;
+		Long i=0;
+		for ( ; i<actual_size; i++) {
+		  Long pii = pi[i];
+		  double *pM = MPT + pii * size;
+		  p_RESULT[i] = (p_rhs[pii] - SCALAR(pM, p_RESULT, i)) / pM[i];
+		}		
+	 	for ( ; i<size; i++) {
+		  Long pii = pi[i];
+		  double *pM = MPT + pii * size;
+		  p_RESULT[i] = 0.0;
+		  if (FABS((p_rhs[pii] - SCALAR(pM, p_RESULT, i))) > eps) {
+		    if (Pt == NULL) solve_DELETE(&pt);
+#ifdef DO_PARALLEL
+		    RFERROR("Equation system not solvable"); // RFERROR necessary.
+#else		    
+		    GERR1("Equation system not solvable (difference %10e). Try increasing 'pivot_relerror' in 'RFoptions' to get an approximate solution.",
+			 p_rhs[pii] - SCALAR(pM, p_RESULT, i));
+#endif
+		    
+		  }
+		}
+	      }
+
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) schedule(static) if (rhs_cols > 30)
+#endif 
+	      for (Long k=0; k<rhs_cols; k++) {	      
+		double *p_RESULT = RESULT + k * size;
+		for (Long i=actual_size - 1; i>=0; i--) {
+		  Long pii = pi[i];
+		  double *pM = MPT + pii * size,
+		    r = (p_RESULT[i] /= pM[i]);
+		  LINEAR(pM, -r, i, p_RESULT);
+		  // for (Long j=0; j<i; j++) p_RESULT[j] -= r * pM[j];
+		}
+	      }
+	      CMALLOC(xja, size, int);
+	      Sort(RESULT, size, rhs_cols, pi, xja, D);
+	    }// rhs_cols > 0
+	  } // not sqrt only
+
+	} // err == NOERROR	
+      }
+
+      ERR_CHOL:
+      
+      if (err != NOERROR) {	
+	if (pt->actual_pivot == PIVOT_NONE)
+	  CERR2("Probably matrix not positive definite: %.300s. Consider 'RFoptions(%.80s)'.\n",
+		ErrStr,
+		calculate != SOLVE || rhs_cols > 0  ?
+		"'pivot=PIVOT_AUTO)' or 'RFoptions(pivot=PIVOT_DO"
+		: "solve_method=\"eigen\", solve.pseudoinverse=TRUE") // OK
+
+	    
+	  else // pt->actual_pivot == PIVOT_DO  or  PIVOT_IDX
+	    CERR1("Likely, the matrix is not positive semi definite: %.300s. Consider 'RFoptions(solve_method=\"svn\"\n", ErrStr)
+	      }
+      
+      if (PL >=  PL_DETAILSUSER) {
+	PRINTF("Cholesky decomposition successful\n");
+      }
+      }
+      break;
+    case QR : {// QR returns transposed of the inverse !! 
+      if (rhs_cols > 0 || logdet != NULL || calculate != SOLVE) {
+	err = ERRORFAILED;
+	continue;
+      }
+
+        err = ERRORNOTPROGRAMMEDYET; /// to do: clarify transposed !
+      continue;
+
+      CMALLOC(w2, size, double);
+      CMALLOC(w3, size, double);
+
+      F77dgeqrf(&size, &size, // QR
+		       MPT, &size, // aijmax, &irank, inc, w2, 
+		       w3, w2, &size, &err);     
+      assert(false); // code zur invertierung fehlt hier!
+      
+      if (err != NOERROR) {	
+	CERR1("'dgeqrf' failed with err=%d.", err);
+      }
+      if (PL >=  PL_DETAILSUSER) { PRINTF("QR successful\n"); }
+      break;
+    }
+      
+    case Eigen : { //  M = U D UT
+      int max_eigen = sp->max_svd;
+      double eigen2zero = sp->eigen2zero;
+      if (size > max_eigen) CERR0("matrix too large for Cholesky or eigen value decomposition. Increase 'max_chol' and 'max_svd' in 'RFoption' if necessary.");
+      
+      double 
+	optimal_work,
+	*pt_work = &optimal_work;
+      int k=0,
+	optimal_intwork,
+	*pt_iwork = &optimal_intwork, 
+	lw2 = -1,
+	lintwork = -1;
+      
+      C_MALLOC(Long, U, sizeSq, double);
+      CMALLOC(D, size, double); 
+      CMALLOC(xja, 2 * (Long) size, int);
+      CMALLOC(w3, size, double);
+ 
+      for (int i=0; i<=1; i++) {
+	double dummy = 0.0,
+	  abstol = 0.0;
+	int dummy_nr;
+
+	//	printf("%f %f %f %f\n", MPT[0],  MPT[1],  MPT[2],  MPT[3]);
+	F77dsyevr("V", "A", "U", &size, // Eigen
+			 MPT, &size, &dummy, &dummy, &k, &k, 
+			 &abstol,// or DLAMCH
+			 &dummy_nr, D, U, &size, 
+			 xja, // 2 * size * size of(integer); nonzeros_idx
+			 pt_work, &lw2,
+			 pt_iwork, &lintwork,
+			 &err
+#ifdef USE_FC_LEN_T	  
+		  FCONE FCONE FCONE
+#endif		  
+			 );
+	//	printf("eigen %d %f %f;  %e %e %e %e\n", err, D[0], D[1], U[0], U[1], U[2], U[3]);
+	if (i==1 || err != NOERROR || ISNAN(D[0])) break;
+	lw2 = (int) optimal_work;
+	lintwork = (int) optimal_intwork;
+	CMALLOC(w2, lw2, double);
+	CMALLOC(iwork, lintwork, int);
+	pt_iwork = iwork;
+	pt_work = w2; 
+      }
+
+      
+      if (err != NOERROR) {
+	if (PL>PL_ERRORS) { PRINTF("F77 error code for 'dsyevr' = %d\n", err);}
+	CERR1("'dsyevr' failed with err=%d.", err);
+	break;
+      }
+      
+      for (Long i=0; i<size; i++)
+	if (D[i] < -eigen2zero) {
+	  const char
+	    *advice[2]={"", " Consider increasing the value of 'eigen2value'."};
+	  double min  = D[i];
+	  for (Long j=i+1; j<size; j++) if (D[j] < min) min = D[j];
+	  GERR3("Negative eigenvalues found (less than -eigen2zero=%10e). Smallest one equals %10e. %.50s", -eigen2zero, min, advice[min > -eigen2zero * 100]);
+	  
+	} 
+      
+      if (calculate == MATRIXSQRT) {
+	for (Long j=0; j<size; j++) {
+	  double dummy;
+	  dummy = 0.0;
+	  if (D[j] >= eigen2zero) dummy = SQRT(D[j]);
+	  for (Long i=0; i<size; i++, k++) RESULT[k] = U[k] * dummy;
+	}
+      } else {
+	// calculate determinant 
+	if (logdet != NULL) {
+	  *logdet = cumProd(D, size, sp->det_as_log);
+	  if (calculate == DETERMINANT) {
+	    err = NOERROR;
+	    goto ErrorHandling;
+	  }
+	}
+
+	//	printf("Hiere EIGGEN\n");
+	bool pseudoInverse = false;
+	for (Long j=0; j<size; j++) {
+	  pseudoInverse |= D[j] < eigen2zero;
+	  w3[j] = D[j] < eigen2zero ? 0.0 : 1.0 / D[j];
+	}
+	
+	if (rhs_cols > 0) {
+	  Long tot = (Long) size * rhs_cols;
+	  C_MALLOC(Long, w2, tot, double);	
+	  matmulttransposed(U, RHS, w2, size, size, rhs_cols, cores);
+	  if (pseudoInverse) {
+	    for (k=0; k<tot; ) {
+	      for (Long i=0; i<size; i++) {
+		double cmp = w2[k];
+		w2[k] *= w3[i];
+		if (FABS(w2[k] * D[i] - cmp) > eigen2zero) {
+		  GERR0("singular matrix problem does not have a solution");
+		}
+		k++;
+	      }
+	    }
+	  } else {
+	    for (k=0; k<tot; )
+	      for (Long i=0; i<size; i++) w2[k++] *= w3[i];
+	  }
+	  matmult(U, w2, RESULT, size, size, rhs_cols,cores);
+	} else {
+	  if (pseudoInverse && !sp->pseudoinverse)
+	    GERR0("Singular matrix: inverse does not exist. Consider 'RFoption(solve.pseudoinverse=TRUE)'");
+	  C_MALLOC(Long, w2, sizeSq, double);
+	  for (Long kk=0, j=0; j<size; j++) {
+	    double dummy = w3[j];
+	    for (Long i=0; i<size; i++, kk++) w2[kk] = U[kk] * dummy;
+	  }
+	  matmult_2ndtransp(w2, U, RESULT, size, size, size); // V * U^T
+	}
+      }
+
+      if (PL >=  PL_DETAILSUSER) {
+	PRINTF("eigen value decomposition successful\n");
+      }
+      break;
+    }
+	
+    case SVD : {// SVD : M = U D VT
+      if (size > sp->max_svd) CERR0("matrix too large for SVD decomposition.");
+      int 
+	lw2 = -1,
+	size8 = size * 8;
+      double  optim_lwork,
+	eigen2zero = sp->eigen2zero,
+	*pt_w2 = &optim_lwork;
+      
+      C_MALLOC(Long, w3, sizeSq, double);
+      C_MALLOC(Long, U, sizeSq, double);
+      CMALLOC(D, size, double); 
+      CMALLOC(iwork, size8, int);
+      CMALLOC(lnz, size, double);
+       
+      for (Long i=0; i<=1; i++) {
+	F77dgesdd("A", &size, &size, // SVD
+			 MPT, &size, D, U, &size, w3, &size, 
+			 pt_w2, &lw2, iwork, &err
+#ifdef USE_FC_LEN_T	  
+		  FCONE
+#endif		  
+		  );
+	if (i==1 || err != NOERROR || ISNAN(D[0])) break;
+	lw2 = (int) optim_lwork;
+	CMALLOC(w2, lw2, double);
+	pt_w2 = w2;
+      }
+      if (err != NOERROR) {
+	if (PL>PL_ERRORS) { PRINTF("F77 error code for 'dgesdd' = %d\n", err);}
+	CERR1("'dgesdd' failed with err=%d.", err);
+	break;
+      }
+
+      if (calculate == MATRIXSQRT) {
+	double svdtol = sp->svd_tol;
+	/* calculate SQRT of covariance matrix */
+ 	for (Long j=0, k=0; j<size; j++) {
+	  double dummy;
+	  if (D[j] < -eigen2zero) CERR0("negative eigenvalues found.");
+	  dummy = 0.0;
+	  if (D[j] >= eigen2zero) dummy = SQRT(D[j]);
+	  for (Long i=0; i<size; i++, k++) RESULT[k] = U[k] * dummy;
+	}
+ 
+	/* check SVD */
+ 	if (svdtol > 0.0) {
+	  for (Long i=0; i<size; i++) {
+	    double *Ui = RESULT + i;
+	    for (Long k=i; k<size; k++) {
+	      double *Uk = RESULT + k,
+		sum = 0.0;
+	      for (Long j=0; j<sizeSq; j+=size) {
+		sum += Ui[j] * Uk[j];
+	      }
+	      
+	      if (FABS(M0[i * size + k] - sum) > svdtol) {
+		if (PL > PL_ERRORS) {
+		  PRINTF("difference %10e at (%ld,%ld) between the value (%10e) of the covariance matrix and the square of its root (%10e).\n", 
+			 M0[i * size +k] - sum, i, k, M0[i*size+k], sum);
+		}
+		FERR3("required precision not attained  (%10e > %10e): probably invalid model. See also '%.50s'.", FABS(M0[i * size + k] - sum), svdtol,
+		      solve[SOLVE_SVD_TOL]);
+
+		err=ERRORM;
+		break;
+	      } //else printf("ok (%d,%d) %10g %10g\n", i, k, M0[i*size+k],sum);
+	    }
+	    if (err != NOERROR) break;		
+	  }
+	  if (err != NOERROR) break;		
+	} // end if svdtol > 0
+
+      } else {
+        // calculate determinant 
+        if (logdet != NULL) {
+	  *logdet = cumProd(D, size, sp->det_as_log);
+	  if (calculate == DETERMINANT) {
+	    err = NOERROR;
+	    goto ErrorHandling;
+	  }
+	}
+ 	
+  	bool pseudoInverse = false;
+	for (Long j=0; j<size; j++) {
+	  bool small = FABS(D[j]) < eigen2zero;
+	  pseudoInverse |= small;
+	  lnz[j] = small ? 0.0 : 1.0 / D[j];
+	}
+	
+	if (rhs_cols > 0) {
+	  Long tot = (Long) size * rhs_cols;
+	  C_MALLOC(Long, w2, tot, double);	
+	  matmulttransposed(U, RHS, w2, size, size, rhs_cols,cores);
+	  if (pseudoInverse) {
+	    for (Long k=0; k<tot; ) {
+	      for (Long i=0; i<size; i++) {
+		double cmp = w2[k];
+		w2[k] *= lnz[i];
+		if (FABS(w2[k] * D[i] - cmp) > eigen2zero) {
+		  GERR0("singular matrix problem does not have a solution.");
+		}
+		k++;
+	      }
+	    }
+	  } else {
+	  for (Long k=0; k<tot; )
+	    for (Long i=0; i<size; i++) w2[k++] *= lnz[i];
+	  }
+	  matmulttransposed(w3, w2, RESULT, size, size, rhs_cols,cores);
+	} else {
+	  // calculate inverse of covariance matrix
+	  if (pseudoInverse && !sp->pseudoinverse)
+	    GERR0("Singular matrix: inverse does not exist. Consider 'RFoption(solve.pseudoinverse=TRUE)'");
+	  for (Long k=0, j=0; j<size; j++) {
+	    double dummy = lnz[j];
+	    for (Long i=0; i<size; i++) U[k++] *= dummy;
+	  }
+	  matmult_tt(U, w3, RESULT, size, size, size, cores); // V * U^T
+	}
+      }
+
+      if (PL >=  PL_DETAILSUSER) { PRINTF("svd successful\n"); }
+      break;
+    }
+
+    case LU : {// LU
+      //printf("LU\n");
+      if (calculate == MATRIXSQRT) {
+	err = ERRORFAILED;
+	continue;
+      }
+      
+      CMALLOC(xja, size, int);		    
+      F77dgetrf(&size, &size, MPT, &size, xja, &err);
+      if (err != NOERROR) {
+	CERR1("'dgetrf' (LU) failed with err=%d.", err);
+      }
+ 
+      //printf("LU %d\n", logdet != NULL);
+
+      
+      if (logdet != NULL) {
+	*logdet =  DeterminantLU(MPT, size, sp->det_as_log, xja);
+        if (calculate == DETERMINANT) {
+	  err = NOERROR;
+	  goto ErrorHandling;
+	}
+      }
+
+      if (rhs_cols > 0) {
+	if (rhs_cols > MAXINT) BUG;
+	int rhs_cols0 =  (int) rhs_cols;
+	Long totalRHS = (Long) size * rhs_cols;
+	if (result != NULL) MEMCOPY(RESULT, RHS, sizeof(*RESULT) * totalRHS);
+	F77dgetrs("N", &size, // LU rhs
+			 &rhs_cols0, MPT, &size, xja, 
+			 RESULT, &size, &err
+#ifdef USE_FC_LEN_T		  
+		  FCONE
+#endif		  
+		  );
+	if (err != NOERROR) {	
+	  CERR1("'dgetrs' (LU) failed with err=%d.", err);
+	}
+      } else {
+	int lw2 = -1;
+	double dummy,
+	  *p = &dummy;
+	for (int i=0; i<=1; i++) { 
+	  F77dgetri(&size, MPT, // LU solve
+			   &size, xja, p, &lw2, &err);	
+	  if (err != NOERROR) break;
+	  lw2 = (int) dummy;
+	  CMALLOC(w2, lw2, double);
+	  p = w2;
+	}
+      }      
+      if (PL >=  PL_DETAILSUSER) { PRINTF("LU decomposition successful\n"); }
+      break;
+    }
+	
+    case Sparse : {// sparse matrix
+#if defined compatibility_to_C_h
+      BUG;
+#else
+      if (sizeSq > MAXINT) BUG;
+      
+      Long halfsq = (Long) size * (size + 1) / 2;
+      int nnzlindx = -1, 
+	doperm = sp->pivotsparse,
+	nnzcolindices = 0,
+	nnzR = 0,
+	cache = 512, // to do: CPU cache size
+	nnzcfact[3] = { 5, 1, 5 }, 
+	nnzRfact[3] = { 5, 1, 2 };
+      double
+	cholincrease_nnzcol = 1.25,
+	cholincrease_nnzR = 1.25;
+
+      if (!posdef) CERR0("'spam' needs a positive definite matrix.");
+      CMALLOC(pivotsparse, size, int);
+      if (!doperm) for (int i=0; i<size; i++) pivotsparse[i] = i + 1;
+
+      if (spam_zaehler == 0) { 
+	for (Long i=0; i<sizeSq; i++) nnzA += !(FABS(M0[i]) <= spam_tol);
+	spam_zaehler = nnzA + 1; // falls nur aus Nullen bestehend
+      }
+
+      CMALLOC(xlnz, sizeP1, int);
+      CMALLOC(snode, size, int);
+      CMALLOC(xsuper, sizeP1, int);
+      CMALLOC(iwork, sizeP1, int);
+      CMALLOC(invp, size, int);
+      CMALLOC(w3, size, double);
+
+      CMALLOC(cols, spam_zaehler, int);
+      CMALLOC(rows, sizeP1, int);
+   
+      int nD = spam_zaehler;
+      if (nD < size) nD = size;
+      CMALLOC(D, nD, double);
+      // prepare spam
+
+      F77spamdnscsr(&size, &size, M0, &size, D,
+			   cols, // ja
+			   rows, // ia
+			   &spam_tol); // create spam object   
+      pt->nsuper = 0;
+      // calculate spam_cholesky
+      err = 4; // to get into the while loop
+      while (err == 4 || err == 5) {
+	if (nnzcolindices == 0) {
+	  double rel = nnzA / (double) size;
+	  if (rel < 5) {
+	    nnzcolindices = (int) CEIL(nnzA * (1.05 * rel - 3.8));
+	    if (nnzcolindices < 1000) nnzcolindices = 1000;
+	  } else {
+	    nnzcolindices = nnzA;
+	  }
+	  nnzcolindices *= nnzcfact[doperm];
+	  if (nnzcolindices < nnzA) nnzcolindices = nnzA;
+	} else if (err == 5) {
+	  int tmp = (int) CEIL(nnzcolindices * cholincrease_nnzcol);
+	  if (PL > PL_RECURSIVE) {
+	    PRINTF("Increased 'nnzcolindices' with 'NgPeyton' method\n(currently set to %d from %d)", tmp, nnzR);
+	  }
+	  nnzcolindices = tmp;
+	}
+	if (nnzcolindices < pt->n_lindx) nnzcolindices = pt->n_lindx;
+	
+	if (nnzR == 0) {
+	  double u = FLOOR(.4 * POW(nnzA, 1.2));
+	  u = u < 4 * nnzA ? 4 * nnzA : CEIL(u);
+	  nnzR = (int) u * nnzRfact[doperm];
+	} else if (err == 4) {
+	  int tmp = (int) CEIL(nnzR * cholincrease_nnzR);
+	  if (PL > PL_RECURSIVE) {
+	    PRINTF("Increased 'nnzR' with 'NgPeyton' method\n(currently set to %d from %d)", tmp, nnzR);
+	  }
+	  nnzR = tmp;
+	}
+	if (nnzR < pt->n_lnz) nnzR = pt->n_lnz;
+	else if (nnzR > halfsq) nnzR = (int) halfsq;	
+	
+	CMALLOC(lindx, nnzcolindices, int);	
+	CMALLOC(lnz, nnzR, double);
+	 	
+	F77cholstepwise(&size, &nnzA, D, cols, rows, &doperm,
+			       invp, pivotsparse, 
+			       &nnzlindx, &nnzcolindices, 
+			       lindx, // 
+			       iwork,// 
+			&(pt->nsuper), // length of lindx
+			       &nnzR,  // physical length of lindx
+			       lnz,   // output:result
+			       xlnz,  // cols of lnz "ja"
+			       snode,  // supernode membership ??
+			       xsuper, // supernode partioning
+			&cache, // cache size of the CPU
+			&err
+			);       
+	
+	if (err != NOERROR) {
+	  CERR1("'cholstepwise' failed with err=%d.", err);
+	  break;
+	}	 
+      } // while
+      
+      if (err != NOERROR) CERR0("'spam' failed.");
+      if (PL >=  PL_DETAILSUSER) { PRINTF("'spam' successful\n"); }
+      
+      // spam solve
+      
+      if (calculate == MATRIXSQRT) {
+	
+	//BUG; // unexpected behaviour in spam
+	
+	nnzR = xlnz[size] - 1;
+	CMALLOC(xja, nnzR, int);
+	F77calcja(&size, &(pt->nsuper), pt->xsuper, 
+		  pt->lindx, pt->iwork, pt->xlnz, xja);
+	for (Long i=0; i<size; invp[i++]--); 
+	F77spamcsrdns(&size, pt->lnz, xja, pt->xlnz, RESULT);
+	for (Long i=0; i<size; i++) {
+	  Long endfor = (i + 1) * size;
+	  for (Long j = i * (size + 1) + 1; j<endfor; RESULT[j++]=0.0);
+	}
+      } else {     
+	double *lnz = pt->lnz;
+	int *lindx = pt->lindx;
+	
+	// spam determinant
+	if (logdet != NULL) {
+	  if (sp->det_as_log) {
+	    double tmp = 0.0;
+	    for (Long i=0; i<size; i++) tmp += LOG(lnz[xlnz[i] - 1]);	  
+	    *logdet = 2.0 * tmp;
+	  } else {
+	    double tmp = 1.0;
+	    for (Long i=0; i<size; i++) tmp *= lnz[xlnz[i] - 1] ;
+	    *logdet = tmp * tmp;
+	  }
+	  if (calculate == DETERMINANT) {
+	    err = NOERROR;
+	    goto ErrorHandling;
+	  }
+	}
+	
+	/*   z = .Fortran("backsolves", m = nrow,
+	     nsuper, p, a@colindices,
+	     a@colpointers, as.double(a@entries),
+	     a@rowpointers, a@invpivot, a@pivot,
+	     a@supernodes, vector("double",nrow),
+	     sol = vector("double",nrow*p),
+	     as.vector(b,"double"),
+	     NAOK = .Spam$NAOK,PACKAGE = "spam")$sol
+	*/
+	int RHS_COLS;
+	if (rhs_cols <= 0) { // UNBEDINGT VOR double *RHS;
+	  RHS_COLS = size;	
+	  FILL_IN(RESULT, sizeSq, 0.0);
+	  for (Long i=0; i<sizeSq; i += sizeP1) RESULT[i] = 1.0; 
+	} else {
+	  if (rhs_cols > MAXINT) BUG;
+	  RHS_COLS = (int) rhs_cols;	
+	  if (result != NULL) 
+	    MEMCOPY(RESULT, RHS, (Long) size * rhs_cols * sizeof(*RESULT));
+	}
+	
+	//printf("nsuper=%d\n", pt->nsuper);
+	//	  for (Long ii=0; ii<size; ii++) 
+	//printf("%d %d %d %d %10e\n", ii, pt->nsuper, sizeP1, xsuper[ii],
+	//	   w3[ii]);
+	
+	//	  if (false)
+	//	  for (Long jsub=0; jsub<=pt->nsuper; jsub++) {
+	//	    int fj = xsuper[1 - 1],
+	//	      Lj = xsuper[jsub + 1 - 1] -1;
+	//	    printf("%d %d %d\n", jsub, fj, Lj);
+	//	    for (Long jcol=fj; jcol <= Lj; jcol++) {
+	//	      printf("%d,%10g  ", jcol, w3[jcol -  1]);
+	//	    }	    
+	//	  }
+	
+	//	  for (Long jcol=1; jcol <= 600; jcol++) {
+	//	    w3[jcol - 1] = jcol;
+	//   printf("%d,%10g  ", jcol, w3[jcol -  1]);
+	//  }	    
+	
+	
+	//	  printf("%ld %ld %d\n", RESULT, rhs, rhs_cols);
+	//	  for (Long ii=0; ii<size; ii++) printf("%d %10e\n", ii, RESULT[ii]);
+	//	  BUG;
+	
+	F77backsolves(&size, &(pt->nsuper), &RHS_COLS, 
+			     lindx, // colindices
+			     iwork, //colpointers
+			     lnz, 
+			     xlnz, //  rowpointers
+			     invp, pivotsparse,
+			     xsuper, // supernodes
+			     w3, RESULT);	
+	if (PL >=  PL_DETAILSUSER) { PRINTF("'spam' successful\n"); }
+      }         
+      break;
+#endif 
+   } // Sparse
+
+    case NoInversionMethod: GERR0("no inversion method given.");
+    case NoFurtherInversionMethod:
+      STRCPY(ErrStr, WHICH_ERRORSTRING);
+      GERR1("%.300s (All specified matrix inversion methods have failed.)",
+	    ErrStr);
+    case direct_formula: case Diagonal: 
+      GERR1("strange method appeared:%.200s.", CONTACT);
+    default : GERR1("unknown method (%d) in 'RandomFieldsUtils'.", pt->method);
+    } // switch
+
+    if (err==NOERROR) break;
+  } // for m
+
+ 
+ ErrorHandling:
+  if (Pt == NULL) solve_DELETE(&pt);      
+  else Pt->sparse = sparse;
+  
+  //  if (err != NOERROR) { printf("Err = %s %s %d\n",  ErrStr, WHICH_ERRORSTRING ,err); exit(0);}
+  
+  return err; //  -method;
+}
+
+ 
+
+
+int SolvePosDef(double *M, int size, bool posdef, 
+		double *rhs, Long rhs_cols, double *logdet, 
+		solve_storage *PT,
+		int VARIABLE_IS_NOT_USED cores) {
+  if ((rhs == NULL) xor (rhs_cols == 0)) BUG; 
+  return doPosDefIntern(M, size, posdef, rhs, rhs_cols,
+		  NULL, // result, so result returned in M or rhs
+		  logdet, 
+		  SOLVE, // calculate
+		  PT, // storage
+		  &(OPTIONS.solve), cores);
+}
+
+
+int SolvePosDefSp(double *M, int size, bool posdef, 
+		  double *rhs, Long rhs_cols,  double *logdet, 
+		  solve_storage *PT,  solve_options * sp, int VARIABLE_IS_NOT_USED cores) {  
+  if ((rhs == NULL) xor (rhs_cols == 0)) BUG; 
+  return doPosDefIntern(M, size, posdef, rhs, rhs_cols, NULL, logdet, SOLVE,
+		  PT, sp, cores);
+}
+
+
+
+
+int xCinvYdet(double *M, int size, bool posdef, double *X, double *Y,
+	      Long cols, double *XCinvY, double *det, bool log,
+	      solve_storage *PT, int VARIABLE_IS_NOT_USED cores) {
+  // called by randomfields
+  int NR = KAHAN ? SCALAR_KAHAN : SCALAR_AVX;
+  bool pt = PT != NULL && PT->result != NULL;
+  double *result;
+  if (pt) result=PT->result;
+  else result= (double *) MALLOC(sizeof(*result) * (Long) size * cols);  
+  if (result == NULL) return ERRORMEMORYALLOCATION;
+  double *res = result;
+  solve_options sp;
+  MEMCOPY(&sp, &(OPTIONS.solve), sizeof(solve_options));
+  sp.det_as_log = log;
+  int err =  doPosDefIntern(M,// no PROTECT( needed
+		      size, posdef, Y, cols, result, det, SOLVE, PT, &sp,
+		      cores);
+  for (Long i=0; i<cols; i++, res += size, X += size)
+    XCinvY[i] = SCALAR(res, X, size);
+  if (!pt) { FREE(result); }
+  return err;
+}
+
+int xCinvXdet(double *M, int size, double *X, Long X_cols,
+	      double *XCinvX, double *det, bool log, solve_storage *PT,
+	      int VARIABLE_IS_NOT_USED cores) {
+  // called by randomfields
+  return xCinvYdet(M, size, true, X, X, X_cols, XCinvX, det, log, PT, cores);
+}
+
+
+double DetPosDefsp(double *M, int size, solve_options *sp, int VARIABLE_IS_NOT_USED cores) {
+  double det;
+   int err= doPosDefIntern(M, size, true, NULL, 0, NULL, &det,// no PROTECT( needed
+		     DETERMINANT, NULL, sp, cores);
+  if (err != NOERROR)
+    ERR0("error occurred when calculating determinant of a pos def matrix.");
+  return det;  
+}
+
+double DetPosDef(double *M, int size, int VARIABLE_IS_NOT_USED cores) {
+  //never log of det --always small sizes!!
+  //not that M will be destroyed!!
+  solve_options sp;
+  MEMCOPY(&sp, &(OPTIONS.solve), sizeof(solve_options));
+  sp.det_as_log = false;
+  return DetPosDefsp(M, size, &sp, cores);
+}
+
+
+int InvertMatrix(double *M, int size, int VARIABLE_IS_NOT_USED cores) {
+  return doPosDefIntern(M, size, false, NULL, 0, NULL, NULL, SOLVE, NULL, NULL,
+		  cores);
+}
+
+
+
+
+int chol(double *M, int size, solve_options *sp, int VARIABLE_IS_NOT_USED cores) {
+  return doPosDefIntern(M, size, true, NULL, 0, NULL, NULL, MATRIXSQRT, NULL, sp,
+		  cores);   
+}
+
+int cholesky(double *M, int size, int VARIABLE_IS_NOT_USED cores) {
+  solve_options sp;
+  MEMCOPY(&sp, &(OPTIONS.solve), sizeof(solve_options));
+  sp.Methods[0] = sp.Methods[1] = Cholesky;
+  sp.sparse = False; // currently does not work, waiting for Reinhard
+  return chol(M, size, &sp, cores);   
+}
+
+
+bool Is_positive_definite(double *C, int dim, int VARIABLE_IS_NOT_USED cores) { // bool not allowed in C
+  int err;
+  Long bytes = sizeof(*C) * dim * dim;
+  double *test;
+  test = (double*) MALLOC(bytes);
+  MEMCOPY(test, C, bytes);
+  err = cholesky(test, dim, cores);
+  //  printf("errr = %d\n", err);
+  UNCONDFREE(test);
+  return(err == NOERROR);
+}
+
+
+
+/*
+
+  ## extrem wichter check -- folgendes funktioniert bislang bei spam nicht:
+  library(RandomFields, lib="~/TMP")
+  RFoptions(printlevel = 3, pch="", seed=999, use_spam = TRUE) #//
+  z = RFsimulate(RMspheric(), x, max_variab=10000, n=10000, spC=F ALSE)
+  C = cov(t(z))
+  c = RFcovmatrix(RMspheric(), x) #//
+  p rint(summary(as.double(c - C))) ##//
+  stopifnot(max(a b s(c-C)) < 0.05)
+
+*/
+
+
+int SqrtPosDefFree(double *M,  // in out
+		   int size,  
+		   solve_storage *pt,     // in out
+		   solve_options *sp,
+		   int VARIABLE_IS_NOT_USED cores
+		   ){
+  ASSERT_SOLVE(sp);
+  int err;
+  Long sizeSq = (Long) size * size;
+  InversionMethod *Meth = sp->Methods;
+  double *res = NULL;
+  bool extra_alloc = Meth[0] == NoInversionMethod ||
+    Meth[0] == NoFurtherInversionMethod ||
+    (Meth[1] != NoInversionMethod && Meth[1] != NoFurtherInversionMethod &&
+     Meth[1] != Meth[0]) ||
+    (Meth[0] != Cholesky && Meth[0] != Eigen && Meth[0] != SVD);
+  assert(pt != NULL);
+
+  if (sp->sparse == True) 
+    warning("package 'spam' is currently not used for simulation");
+  sp->sparse = False;
+
+  if (extra_alloc) {
+    C_MALLOC(Long, result, sizeSq, double);
+    res = result;
+  } else {
+    FREE(pt->result);
+    pt->result = M;
+    pt->n_result = sizeSq;
+  }
+  
+  // it is ok to have
+  // ==15170== Syscall param sched_setaffinity(mask) points to unaddressable byte(s)
+  // caused by gcc stuff
+  err = doPosDefIntern(M, size, true, NULL, 0, res, NULL, MATRIXSQRT, pt, sp,
+		 cores);// no PROTECT( needed
+  
+  if (extra_alloc) {  
+
+
+#if defined MSDOS_WINDOWS
+    pt->to_be_deleted = M;
+#else
+      FREE(M);
+#endif
+  }
+  return err;
+}
+
+
+void sqrtRHS_Chol(double *U, int size, double* RHS, Long RHS_size, Long n,
+		  double *result,
+		  bool pivot, int act_size,
+		  int cores,
+		  int *pi) {
+  //  printf("n=%d,rhss=%d si=%d pivot=%d, act=%d U=%d  RHS=%d %d pi=%d\n",
+  //	 n,  RHS_size, size,pivot,  act_size, U!=NULL, RHS!=NULL,   result!=NULL, pi!=NULL );
+//  for (Long i=0; i<size; i++) printf("%d ", pi[i]); printf("\n");
+
+  Long nsize = n * (Long) size;
+  int NR = KAHAN ? SCALAR_KAHAN : SCALAR_AVX;
+  assert(U != NULL);
+  if (pivot){
+    Long n_act_size = n * (Long) act_size,
+      diff = size - act_size,
+      n_diff = nsize - n_act_size;
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) if (MULTIMINSIZE(n_act_size)) schedule(dynamic, 8) 
+#endif
+    for (Long i=0 ; i<n_act_size; i++) {
+      Long ii = i % act_size,
+	j = i / act_size;
+      result[pi[ii] + j * size] = SCALAR(RHS + j * RHS_size,
+					 U + pi[ii] * (Long) size, ii + 1);
+    }
+    pi += act_size;
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) schedule(static) if (MULTIMINSIZE(n_diff))
+#endif
+    for (Long i=0; i<n_diff; i++) {
+      Long ii = pi[i % diff],
+	j = i / diff;
+      result[ii + j * size] = SCALAR(RHS + j * RHS_size, U + ii * size,
+				     act_size);
+    }
+  } else {
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) if (MULTIMINSIZE(nsize)) schedule(dynamic, 8) 
+#endif
+    for (Long i=0; i<nsize; i++) {
+      Long ii = i % size,
+	j = (i / size) * size;
+      result[ii + j] = SCALAR(RHS + j, U + ii * size, ii + 1);
+    }
+  }
+}
+
+
+int sqrtRHS(solve_storage *pt, double* RHS, double *result, int cores){
+  // Multipliziert die Cholesky_Zerlegung auf einen Vektor
+  // doppelte Geschwindigkeit der Dreieckmatrix wird hier genutzt
+  // (und dass der andere Teil der Matrix nicht wohl definiert ist)
+  assert(pt != NULL);
+  int size = pt->size;
+  switch (pt->method) {
+  case Rcholesky : {
+    //printf("RchoL\n");
+    int incx = 1;
+    MEMCOPY(result, RHS, (Long) size * sizeof(*result));
+    F77dtrmv("U", "T", "N", &size, pt->result, &size, result, &incx
+#ifdef USE_FC_LEN_T	  
+		  FCONE FCONE FCONE
+#endif		  
+	     );
+  }
+    break;
+  case GPUcholesky :
+    // Alex: Gegebenenfalls schnelle GPU Version von Deiner Seite
+    
+  case direct_formula : 
+  case Cholesky : {
+    //    printf("intern\n");
+      bool pivot = (pt->actual_pivot == PIVOT_DO ||
+		  pt->actual_pivot == PIVOT_IDX) &&
+      pt->method != direct_formula;
+    if (pivot && pt->n_pivot_idx != size) BUG;
+    
+    sqrtRHS_Chol(pt->result, size, RHS, size, 1, result, pivot,
+		 pivot ? pt->actual_size : NA_INTEGER, cores,
+		 pt->pivot_idx);
+    return NOERROR;
+  }
+  case SVD : case Eigen : {  
+    double *U = pt->result;
+    assert(U != NULL);
+#ifdef DO_PARALLEL
+#pragma omp parallel for num_threads(cores) schedule(static) if (MULTIMINSIZE(size))
+#endif
+    for (Long i=0; i<size; i++){
+      double dummy = 0.0;
+      Long k = i;
+      for (Long j=0; j<size; j++, k+=size) dummy += U[k] * RHS[j];
+      result[i] = (double) dummy; 
+    }
+  }
+    break;
+
+  case Sparse : {
+    BUG; // SEE ALSO solve, calculate, tmp_delete !!
+    int one = 1;
+    assert(pt->D != NULL);
+    F77amuxmat(&size, &size, &one, RHS, pt->D, pt->lnz, 
+		      pt->xja, pt->xlnz);
+    for (Long i=0; i<size; i++) result[i] = pt->D[pt->invp[i]];
+  }
+    break;
+
+  case Diagonal : {  
+    Long  i, j,
+      sizeP1 = size + 1;
+    double *D = pt->result;
+    assert(D != NULL);
+    for (i=j=0; j<size; j++, i+=sizeP1) result[j] = RHS[j] * D[i];
+  }
+    break;
+    
+  default :
+    // printf("pt->method %d\n", pt->method);
+    BUG;
+  }
+  
+  return NOERROR;
+}
diff --git a/src/miraculix/transformUint.cc b/src/miraculix/transformUint.cc
index 0ec23b36..66f4ea99 100755
--- a/src/miraculix/transformUint.cc
+++ b/src/miraculix/transformUint.cc
@@ -466,7 +466,6 @@ void transform(unit_t *Old, Long OldSnps, Long OldIndiv, Long OldLDA,
   tmp = (unit_t*) algn_generalL( (int*)(tmp##_unaligned = (unit_t*)	\
 				      CALLOC((inUnits)+safety, BytesPerUnit)), \
 			       MAX_LDA_BITALIGN )
- 
 
   if (isHaplo(OldCoding)) {
     // printf("!!haplo\n");
@@ -987,6 +986,7 @@ void transposeIntern(unit_t * Code, Long snps, Long individuals,
     //    printf("k=%d %d\n", k, rest_indiv > 0);
     if (k) {
       zeros = (unit_t*) CALLOC(real_units[0] + 1, BytesPerUnit); //real_units=[0]=0 moeglich!
+      assert(zeros != NULL);
       Code0 = Code + Lda * indiv ;
       Code1 = codePerByte > 1 ? Code0 + Lda : zeros;
       Code2 = codePerByte > 2 ? Code1 + Lda :  zeros;
diff --git a/src/miraculix/xport_import_rrfu.cc b/src/miraculix/xport_import_rrfu.cc
new file mode 100755
index 00000000..7ef2d65b
--- /dev/null
+++ b/src/miraculix/xport_import_rrfu.cc
@@ -0,0 +1,51 @@
+/*
+ Authors 
+ Martin Schlather, martin.schlather@uni-mannheim.de
+
+ Copyright (C) 2022-2023 Martin Schlather 
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+
+
+#include "Basic_RFUlocal.h"
+#include "extern_RFU.h"
+
+ 
+void startRFU() {
+  // utilsoption_type *utils = &OPTIONS;
+  union {
+    unsigned short a;  
+    unsigned char b[2];
+  } ab;
+  ab.a = 0xFF00;
+  bool bigendian = ab.b[0] != 0;
+  assert((__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) xor bigendian);
+  OPTIONS.basic.bigendian = bigendian;
+  
+  volatile uni64 U64;
+  U64.u32[bigendian] = 1954;
+  U64.u32[!bigendian] = 0x7ff00000;
+  OPTIONS.basic.NA = U64.d8[0];
+  OPTIONS.basic.NaN = 0.0 / 0.0;
+
+#if defined compatibility_to_C_h  
+  ownNA = OPTIONS.basic.NA;
+  ownNaN = OPTIONS.basic.NaN;
+#endif  
+
+}
+
+
+
diff --git a/tests/dgemm_compressed/test.jl b/tests/dgemm_compressed/test.jl
index b28d1c95..a9f84169 100644
--- a/tests/dgemm_compressed/test.jl
+++ b/tests/dgemm_compressed/test.jl
@@ -35,7 +35,7 @@ include(MODULE_PATH)
 println("Load library and set options")
 miraculix.dgemm_compressed.set_library_path(LIBRARY_PATH)
 miraculix.dgemm_compressed.load_shared_library()
-miraculix.dgemm_compressed.set_options(use_gpu=!false)
+miraculix.dgemm_compressed.set_options(use_gpu=false, verbose=0)
 
 println("Read bed file and frequencies")
 genotype_data, n_snps, n_indiv = miraculix.read_plink.read_bed(DATA_FILE)