StevenLShafer · daattali · Sep 19, 2025 · Sep 19, 2025 · Sep 19, 2025 · Sep 20, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,6 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+*.Rproj
+.Renviron
diff --git a/Global.R b/Global.R
@@ -1,108 +1,46 @@
 # Global.R
 library(shiny)
-library(openxlsx)
-library(readxl)
 library(metap)
 library(Rfast)
 library(shinyjs)
 library(rsconnect)
-library(shinyWidgets)
 #library(mirai)
 #library(promises)
-library(future)
-library(foreach)
-library(doParallel)
 library(MBESS)
 library(dqrng)
 library(OpenMx)
 library(digitTests)
 library(rsconnect)
 library(bslib)
 library(shinydashboard)
+library(future)
+library(foreach)
 library(doFuture)
-
-registerDoFuture(flavor = "%dofuture%")
-plan(multisession)
-
 #library(BiocManager)
 #BiocManager::install("multtest")
 
-remove(list=ls())
-
-DATA <- NULL
-Results <- NULL
-LineNumber <- 1
-TRIALS <- NULL
-LengthTrials <- NULL
-priorMessage <- NULL
-ColumnNames <- NULL
-CategoryNames <- NULL
-
-# m is the replication number for the Monte Carlo simulation
-m <- 100000
-m <- 15000
+RUN_PARALLEL <- FALSE
 
+if (RUN_PARALLEL) {
+  calculate_workers <- function() {
+    total_cores <- parallelly::availableCores()
 
-############################################################################
-# References                                                               #
-# Carlisle JB. The analysis of 168 randomised controlled trials to test    #
-# data integrity. Anaesthesia. 2012;67:521-537.                            #
-#                                                                          #
-# Carlisle JB, Dexter F, Pandit JJ, Shafer SL, Yentis SM. Calculating the  #
-# probability of random sampling for continuous variables in submitted or  #
-# published randomised controlled trials. Anaesthesia. 2015;70:848-58.     #
-#                                                                          #
-# Carlisle JB. Data fabrication and other reasons for non-random sampling  #
-# in 5087 randomised, controlled trials in anaesthetic and general medical #
-# journals. Anaesthesia. 2017;72:944-952                                   #
-############################################################################
-
-is_local <- Sys.getenv('SHINY_PORT') == ""
-if (is_local)
-  setwd("g:/projects/Fraud/2025")
-
-outputComments <- function(
-    ...,
-    echo = getOption("ECHO_OUTPUT_COMMENTS", TRUE),
-    sep = " ")
-{
-  isolate({
-    argslist <- list(...)
-    if (length(argslist) == 1) {
-      text <- argslist[[1]]
+    if (total_cores <= 2) {
+      return(1)
+    } else if (total_cores <= 4) {
+      return(total_cores - 1)
+    } else if (total_cores <= 8) {
+      return(total_cores - 2)
     } else {
-      text <- paste(argslist, collapse = sep)
+      return(ceiling(total_cores * 0.6))
     }
-
-    # If this is called within a shiny app, try to get the active session
-    # and write to the session's logger
-    commentsLog <- function(x) invisible(NULL)
-    session <- getDefaultReactiveDomain()
-    if (!is.null(session) &&
-        is.environment(session$userData) &&
-        is.reactive(session$userData$commentsLog))
-    {
-      commentsLog <- session$userData$commentsLog
-    }
-
-    if (is.na(echo)) return()
-    if (is.data.frame((text)))
-    {
-      con <- textConnection("outputString","w",local=TRUE)
-      capture.output(print(text, digits = 3), file = con, type="output", split = FALSE)
-      close(con)
-      if (echo)
-      {
-        for (line in outputString) cat(line, "\n")
-      }
-      for (line in outputString) commentsLog(paste0(commentsLog(), "<br>", line))
-    } else {
-      if (echo)
-      {
-        cat(text, "\n")
-      }
-      commentsLog(paste0(commentsLog(), "<br>", text))
-    }
-  })
+  }
+
+  plan(multisession, workers = calculate_workers())
 }
 
+# replication number for the Monte Carlo simulation
+MONTE_CARLO_SIM_REPS <- 15000
+
+COMMON_COL_NAMES <- c("TRIAL", "ROW", "N", "MEAN", "SD",  "ROUND_MEAN", "ROUND_OBSERVATION")
+COLS_CONTINUOUS <- c("N", "MEAN", "SD")
diff --git a/Integrity Analysis.docx b/Integrity Analysis.docx
diff --git a/R/p_calc.R b/R/p_calc.R
@@ -0,0 +1,151 @@
+# Primary Statistical Function for Monte Carlo Simulation
+P_Calc <- function(trial_data, CategoryNames)
+{
+  data <- trial_data
+  TRIAL <- trial_data$TRIAL[1]
+  RowIDs <- unique(data$ROW)
+
+  x <- foreach(
+    j = 1:length(RowIDs),
+    .combine = rbind
+  ) %do%
+    {
+      Row <- RowIDs[j]
+      ROWS <- data[data$ROW == Row,]
+
+      # Greater than 1 line?
+      if (nrow(ROWS) > 1)
+      {
+        # Is this categorical?
+        if (all(!is.na(ROWS$N)))
+        {
+          COLS <- nrow(ROWS)
+          N <- sum(ROWS$N)
+          Meanmean <- sum(ROWS$N*ROWS$MEAN) / N
+          # The calculation of Meanvar is OK. SD^2 is an unbiased estimate
+          # of variance
+          Meanvar <-  sum(ROWS$N*ROWS$SD^2) / N
+
+          # However, this next calculatiion is biased. s.u. will correct it
+          # If N > 30, then the correction is < 1 %. It blows up if N > 343!
+          if (N < 30)
+          {
+            Meansd <- s.u(sqrt(Meanvar), N)
+          } else {
+            Meansd <- sqrt(Meanvar)
+          }
+          # Protect size of simulation
+          if ((MONTE_CARLO_SIM_REPS*N) < 1000000000) # One billion
+          {
+            m1 <- MONTE_CARLO_SIM_REPS
+          } else {
+            m1 <- 1000000000 / N
+          }
+          SEMsample <- Meansd/sqrt(mean(ROWS$N))
+          DiffSample <- sum((ROWS$MEAN - Meanmean)^2) # Squared difference of column means
+          # Monte Carlo Simulation
+          meansim <- dqrnorm(MONTE_CARLO_SIM_REPS,mean=Meanmean,sd=SEMsample) # Generate a new mean for each simulation
+          MonteCarloMean <- matrix(NA, nrow = m1, ncol = COLS) # I want one row for each simulation
+          # Need to do each column separately. Couldn't think of an efficient way to do this without
+          # a loop.
+          for (i in 1:COLS)
+            MonteCarloMean[,i] <-
+            round(
+              rowmeans(
+                round(
+                  # The matrix below will have one row for each replication (m rows),
+                  # and one column for each person (N[i] columns)
+                  # Cannot use dqrnorm because it won't support the array
+                  # of meansim needed for each replication
+                  matrix(
+                    rnorm(ROWS$N[i] * m1, rep(meansim, ROWS$N[i]), Meansd),
+                    nrow = m1, byrow = FALSE
+                  ),
+                  ROWS$ROUND_OBSERVATION[i]
+                )
+              ),
+              ROWS$ROUND_OBSERVATION[i]
+            )
+          N <- matrix(ROWS$N, nrow = m1, ncol = COLS, byrow = TRUE)
+          # Calculate the weighted mean, and then round
+          MeanSamples <- rowsums (MonteCarloMean * N) / sum(ROWS$N)
+          DiffSamples <- rowsums((MonteCarloMean - MeanSamples)^2)
+
+          PEQ <- sum(DiffSamples == DiffSample) / m1
+          PLE <- sum(DiffSamples < DiffSample)/m1 + PEQ
+          PGE <- sum(DiffSamples > DiffSample)/m1 + PEQ
+        } else {
+          ROWS <- ROWS[,CategoryNames]
+          for (NAME in CategoryNames)
+          {
+            if (all(is.na(ROWS[,NAME])))
+              ROWS[,NAME] <- NULL
+          }
+          PLE <- chisq.test(ROWS, simulate.p.value=MONTE_CARLO_SIM_REPS)$p.value
+          PGE <- 1-PLE
+        }
+        # Need to be sure P != 0 or 1
+        if(PLE == 1) PLE <- 0.999
+        if(PLE == 0) PLE <- 0.001
+        if(PGE == 1) PGE <- 0.999
+        if(PGE == 0) PGE <- 0.001
+        PLE = as.character(signif(PLE,4))
+        PGE = as.character(signif(PGE, 4))
+      } else {
+        PLE = "Only 1 Row"
+        PGE = NA
+      }
+      c(as.character(Row), PLE, PGE)
+    } %seed% TRUE
+
+  # This bizarre code is because if there is only 1 row, R creates a data.frame
+  # with 3 columns and 1 row.
+  if (length(x) == 3)
+  {
+    x <- as.data.frame(t(x))
+  } else {
+    x <- as.data.frame(x)
+  }
+
+  x <- cbind(NA, x)
+  x[1,1] <- TRIAL
+  x <- as.data.frame(x)
+  names(x) <- c("TRIAL", "ROW", "PLE", "PGE")
+  cat("Row IDs", RowIDs, "\n")
+  #print(x)
+  cat("match results", match(x$ROW, RowIDs), "\n")
+  x <- x[match(x$ROW, RowIDs),]
+  #print(x)
+
+  PLEvalues <- as.numeric(x$PLE)
+  PGEvalues <- as.numeric(x$PGE)
+
+  PLEvalues <- PLEvalues[!is.na(PLEvalues)]
+  PGEvalues <- PGEvalues[!is.na(PGEvalues)]
+
+  if (length(PLEvalues) > 1) {
+    PLE <- signif(sumz(PLEvalues)$p, 4)
+  } else if (length(PLEvalues) == 1) {
+    PLE <- PLEvalues
+  } else if (length(PLEvalues) == 0) {
+    PLE <- "No values"
+  }
+
+  if (length(PGEvalues) > 1) {
+    PGE <- signif(sumz(PGEvalues)$p,4)
+  } else if (length(PGEvalues) == 1) {
+    PGE <- PGEvalues
+  } else if (length(PGEvalues) == 0) {
+    PGE <- "No values"
+  }
+
+  lastline <- data.frame(
+    TRIAL = c(NA, NA),
+    ROW = c("Summary", NA),
+    PLE = c(as.character(PLE), NA),
+    PGE = c(as.character(PGE), NA)
+  )
+
+  x <- rbind(x, lastline)
+  return(x)
+}
diff --git a/R/utils.R b/R/utils.R
@@ -0,0 +1,88 @@
+read_input_file <- function(filepath) {
+  ext <- tools::file_ext(filepath)
+
+  result <- tryCatch({
+    if (ext == "csv") {
+      read.csv(filepath)
+    } else if (ext == "xlsx") {
+      readxl::read_xlsx(filepath)
+    } else if (ext == "xls") {
+      readxl::read_xls(filepath)
+    } else {
+      stop(".", ext, " is not a supported file type")
+    }
+  }, error = function(err) {
+    stop("Error reading file: ", err$message)
+  })
+
+  if (is.data.frame(result) && nrow(result) == 0) {
+    stop("File does not contain any data")
+  }
+
+  result
+}
+
+is_category <- function(x) {
+  # Remove NAs first for efficiency, then check if all values are integers
+
+  # If there are no na values, then it can't be a category
+  if (!any(is.na(x)))
+    return(FALSE)
+
+  # If the vector is empty after removing NAs then it is not a category
+  x_clean <- x[!is.na(x)]
+  if (length(x_clean) == 0)
+    return(FALSE)
+
+  # if there are non-numeric values, it is not a category
+  if (!is.numeric(x_clean))
+    return(FALSE)
+
+  # Check if all values are equal to their integer representation
+  all(x_clean == as.integer(x_clean))
+}
+
+outputComments <- function(
+    ...,
+    echo = getOption("ECHO_OUTPUT_COMMENTS", TRUE),
+    sep = " ")
+{
+  isolate({
+    argslist <- list(...)
+    if (length(argslist) == 1) {
+      text <- argslist[[1]]
+    } else {
+      text <- paste(argslist, collapse = sep)
+    }
+
+    # If this is called within a shiny app, try to get the active session
+    # and write to the session's logger
+    commentsLog <- function(x) invisible(NULL)
+    session <- getDefaultReactiveDomain()
+    if (!is.null(session) &&
+        is.environment(session$userData) &&
+        is.reactive(session$userData$commentsLog))
+    {
+      commentsLog <- session$userData$commentsLog
+    }
+
+    if (is.na(echo)) return()
+    if (is.data.frame((text)))
+    {
+      con <- textConnection("outputString","w",local=TRUE)
+      capture.output(print(text, digits = 3), file = con, type="output", split = FALSE)
+      close(con)
+      if (echo)
+      {
+        for (line in outputString) cat(line, "\n")
+      }
+      for (line in outputString) commentsLog(paste0(commentsLog(), "\n", line))
+    } else {
+      if (echo)
+      {
+        cat(text, "\n")
+      }
+      commentsLog(paste0(commentsLog(), "<br>", text))
+    }
+  })
+}