Remove the test metadata completeness feature (may be revisited in future development)

TNRiley · TNRiley · commit c174fcfbbd59 · 2025-12-11T13:45:29.000-05:00
diff --git a/inst/shiny-app/CiteSource/app2.R b/inst/shiny-app/CiteSource/app2.R
@@ -311,14 +311,6 @@ ui <- shiny::navbarPage("CiteSource",
                                   shiny::h5("Step 2: Double click on a column to edit sources, labels, and strings. Use *Ctrl+Enter* to save edits, one column at a time"),
                                   # Output: Data file ----
                                   DT::dataTableOutput("tbl_out"),
-                                  # Metadata completeness section - only show after upload
-                                  shiny::conditionalPanel(
-                                    condition = "output.upload_complete",
-                                    shiny::br(),
-                                    shiny::h5("Metadata Completeness by File"),
-                                    shiny::p("This heatmap shows the percentage of records in each file that have each metadata field populated. Fields used for matching duplicates are highlighted."),
-                                    plotly::plotlyOutput("completeness_heatmap", height = "600px")
-                                  )
                                 )
                               )
                             )
@@ -1006,346 +998,6 @@ server <- function(input, output, session) {
     }
   })
   
-  # Output to control visibility of completeness section
-  output$upload_complete <- shiny::reactive({
-    is.data.frame(rv$df) && nrow(rv$df) > 0
-  })
-  shiny::outputOptions(output, "upload_complete", suspendWhenHidden = FALSE)
-  
-  # Reactive to calculate metadata completeness by file
-  completeness_data <- shiny::reactive({
-    # Only calculate if upload is complete (rv$df has data)
-    if (!is.data.frame(rv$df) || nrow(rv$df) == 0) {
-      return(data.frame())
-    }
-    
-    # Use upload_df if available, otherwise use latest_unique (for reimported files)
-    data_to_use <- if (is.data.frame(rv$upload_df) && nrow(rv$upload_df) > 0) {
-      rv$upload_df
-    } else if (is.data.frame(rv$latest_unique) && nrow(rv$latest_unique) > 0) {
-      rv$latest_unique
-    } else {
-      return(data.frame())
-    }
-    
-    # Require that data exists
-    if (nrow(data_to_use) == 0) {
-      return(data.frame())
-    }
-    
-    # Define fields used for matching (key fields for deduplication)
-    matching_fields <- c(
-      "title", "author", "year", "journal", "abstract", 
-      "doi", "volume", "pages", "number", "isbn"
-    )
-    
-    # Calculate completeness for each file/source
-    completeness_list <- list()
-    
-    # Get unique sources (files) - handle comma-separated sources by separating them
-    # For upload_df, each record typically has one source, but we'll handle both cases
-    if ("cite_source" %in% names(data_to_use)) {
-      # Separate comma-separated sources if needed, but keep ALL columns using helper function
-      source_data_expanded <- data_to_use %>%
-        CiteSource:::expand_single_metadata_column("cite_source")
-      
-      unique_sources <- unique(source_data_expanded$cite_source)
-      unique_sources <- unique_sources[!is.na(unique_sources) & trimws(unique_sources) != ""]
-    } else {
-      return(data.frame())
-    }
-    
-    if (length(unique_sources) == 0) {
-      return(data.frame())
-    }
-    
-    for (source in unique_sources) {
-      # Filter data for this source (using expanded data where sources are already separated)
-      source_data <- source_data_expanded[source_data_expanded$cite_source == source, ]
-      
-      if (nrow(source_data) == 0) next
-      
-      total_records <- nrow(source_data)
-      
-      # Get all available column names in the actual data
-      available_cols <- names(source_data)
-      
-      # Calculate completeness for each field
-      for (field in matching_fields) {
-        # Handle journal field - check multiple possible column names
-        # In some formats, journal might be in "journal", "source", or other columns
-        field_to_check <- NULL
-        
-        if (field == "journal") {
-          # For journal, check multiple possible column names in order of preference
-          # RIS files may use T2 tag which maps to "journal" or sometimes "source"
-          journal_candidates <- c("journal", "source", "journal_name", "publication", "pub_title")
-          
-          for (candidate in journal_candidates) {
-            if (candidate %in% available_cols) {
-              # Check if this column actually has data (not all NA/empty)
-              candidate_values <- source_data[[candidate]]
-              candidate_char <- as.character(candidate_values)
-              candidate_char[is.na(candidate_char)] <- ""
-              candidate_non_empty <- sum(trimws(candidate_char) != "" & nchar(trimws(candidate_char)) > 0)
-              
-              # If candidate is "source", also verify it's different from cite_source
-              if (candidate == "source" && "cite_source" %in% available_cols) {
-                source_vals <- unique(candidate_char[trimws(candidate_char) != ""])
-                cite_source_vals <- unique(source_data$cite_source[!is.na(source_data$cite_source) & trimws(as.character(source_data$cite_source)) != ""])
-                # If source values are the same as cite_source, skip it (it's metadata, not journal)
-                if (length(source_vals) > 0 && length(cite_source_vals) > 0 && all(source_vals %in% cite_source_vals) && length(source_vals) == length(cite_source_vals)) {
-                  next  # Skip this candidate, try next one
-                }
-              }
-              
-              # If we found a candidate with data, use it
-              if (candidate_non_empty > 0) {
-                field_to_check <- candidate
-                break
-              }
-            }
-          }
-        } else if (field == "pages") {
-          # For pages, check multiple possible column names
-          # RIS files may use SP/EP tags which map to "startpage"/"endpage" or combined "pages"
-          pages_candidates <- c("pages", "startpage", "endpage", "page", "page_range")
-          
-          for (candidate in pages_candidates) {
-            if (candidate %in% available_cols) {
-              # Check if this column actually has data (not all NA/empty)
-              candidate_values <- source_data[[candidate]]
-              candidate_char <- as.character(candidate_values)
-              candidate_char[is.na(candidate_char)] <- ""
-              candidate_non_empty <- sum(trimws(candidate_char) != "" & nchar(trimws(candidate_char)) > 0)
-              
-              # If we found a candidate with data, use it
-              if (candidate_non_empty > 0) {
-                field_to_check <- candidate
-                break
-              }
-            }
-          }
-          
-          # If we have startpage or endpage but not pages, we can still count pages as present
-          # if either startpage or endpage has data
-          if (is.null(field_to_check)) {
-            if ("startpage" %in% available_cols || "endpage" %in% available_cols) {
-              startpage_vals <- if ("startpage" %in% available_cols) source_data$startpage else NULL
-              endpage_vals <- if ("endpage" %in% available_cols) source_data$endpage else NULL
-              
-              # Check if either has data
-              has_startpage <- FALSE
-              has_endpage <- FALSE
-              
-              if (!is.null(startpage_vals)) {
-                startpage_char <- as.character(startpage_vals)
-                startpage_char[is.na(startpage_char)] <- ""
-                has_startpage <- sum(trimws(startpage_char) != "" & nchar(trimws(startpage_char)) > 0) > 0
-              }
-              
-              if (!is.null(endpage_vals)) {
-                endpage_char <- as.character(endpage_vals)
-                endpage_char[is.na(endpage_char)] <- ""
-                has_endpage <- sum(trimws(endpage_char) != "" & nchar(trimws(endpage_char)) > 0) > 0
-              }
-              
-              # If either has data, we'll use startpage (or endpage if startpage doesn't exist)
-              if (has_startpage && "startpage" %in% available_cols) {
-                field_to_check <- "startpage"
-              } else if (has_endpage && "endpage" %in% available_cols) {
-                field_to_check <- "endpage"
-              }
-            }
-          }
-        } else {
-          # For other fields, just check if the column exists
-          if (field %in% available_cols) {
-            field_to_check <- field
-          }
-        }
-        
-        if (!is.null(field_to_check) && field_to_check %in% names(source_data)) {
-          # Special handling for pages when we have startpage/endpage separately
-          if (field == "pages" && (field_to_check == "startpage" || field_to_check == "endpage")) {
-            # Count records that have EITHER startpage OR endpage (or both)
-            startpage_vals <- if ("startpage" %in% available_cols) source_data$startpage else NULL
-            endpage_vals <- if ("endpage" %in% available_cols) source_data$endpage else NULL
-            
-            # Convert both to character and check for non-empty values
-            has_startpage <- rep(FALSE, total_records)
-            has_endpage <- rep(FALSE, total_records)
-            
-            if (!is.null(startpage_vals)) {
-              startpage_char <- as.character(startpage_vals)
-              startpage_char[is.na(startpage_char)] <- ""
-              startpage_char[startpage_char == "NA"] <- ""
-              startpage_char[startpage_char == "null"] <- ""
-              startpage_char[startpage_char == "NULL"] <- ""
-              has_startpage <- !is.na(startpage_vals) & trimws(startpage_char) != "" & nchar(trimws(startpage_char)) > 0
-            }
-            
-            if (!is.null(endpage_vals)) {
-              endpage_char <- as.character(endpage_vals)
-              endpage_char[is.na(endpage_char)] <- ""
-              endpage_char[endpage_char == "NA"] <- ""
-              endpage_char[endpage_char == "null"] <- ""
-              endpage_char[endpage_char == "NULL"] <- ""
-              has_endpage <- !is.na(endpage_vals) & trimws(endpage_char) != "" & nchar(trimws(endpage_char)) > 0
-            }
-            
-            # Count records that have pages (either startpage or endpage)
-            non_missing <- sum(has_startpage | has_endpage)
-          } else {
-            # Standard handling for other fields
-            field_values <- source_data[[field_to_check]]
-            
-            # Handle different data types (character, factor, list, etc.)
-            if (is.list(field_values)) {
-              # If it's a list, check if any elements are non-empty
-              field_values <- sapply(field_values, function(x) {
-                if (is.null(x) || length(x) == 0) return("")
-                if (is.character(x) || is.factor(x)) return(paste(x, collapse = " "))
-                return(as.character(x))
-              })
-            }
-            
-            # Convert to character - handle factors and other types
-            if (is.factor(field_values)) {
-              field_values_char <- as.character(field_values)
-            } else {
-              field_values_char <- as.character(field_values)
-            }
-            
-            # Replace various missing value representations with empty string
-            # First handle actual NA values
-            is_na_original <- is.na(field_values)
-            field_values_char[is_na_original] <- ""
-            
-            # Then handle string representations of missing values
-            field_values_char[field_values_char == "NA"] <- ""
-            field_values_char[field_values_char == "null"] <- ""
-            field_values_char[field_values_char == "NULL"] <- ""
-            field_values_char[field_values_char == "N/A"] <- ""
-            field_values_char[field_values_char == "n/a"] <- ""
-            field_values_char[field_values_char == "na"] <- ""
-            field_values_char[field_values_char == "Na"] <- ""
-            
-            # Trim whitespace
-            field_values_char <- trimws(field_values_char)
-            
-            # Count non-missing (not empty after trimming and not NA)
-            # Check both the original field_values (for NA) and the character version (for empty strings)
-            non_missing <- sum(
-              !is_na_original &
-              field_values_char != "" &
-              nchar(field_values_char) > 0
-            )
-          }
-          
-          completeness_pct <- if (total_records > 0) (non_missing / total_records) * 100 else 0
-          n_present <- non_missing
-        } else {
-          completeness_pct <- 0
-          n_present <- 0
-        }
-        
-        completeness_list[[length(completeness_list) + 1]] <- data.frame(
-          file_source = source,
-          field = field,
-          completeness_pct = completeness_pct,
-          n_present = n_present,
-          n_total = total_records,
-          stringsAsFactors = FALSE
-        )
-      }
-    }
-    
-    if (length(completeness_list) == 0) {
-      return(data.frame())
-    }
-    
-    # Combine into single data frame
-    completeness_df <- dplyr::bind_rows(completeness_list)
-    
-    # Order fields by importance for matching
-    field_order <- c("title", "author", "year", "doi", "journal", "abstract", 
-                     "volume", "pages", "number", "isbn")
-    completeness_df$field <- factor(completeness_df$field, levels = field_order)
-    
-    # Order sources alphabetically
-    completeness_df$file_source <- factor(completeness_df$file_source, 
-                                         levels = sort(unique(completeness_df$file_source)))
-    
-    return(completeness_df)
-  })
-  
-  # Render completeness heatmap
-  output$completeness_heatmap <- plotly::renderPlotly({
-    comp_data <- completeness_data()
-    
-    # Return empty plot if no data
-    if (!is.data.frame(comp_data) || nrow(comp_data) == 0) {
-      empty_plot <- plotly::plot_ly() %>%
-        plotly::add_annotations(
-          text = "Upload files to see metadata completeness visualization",
-          xref = "paper", yref = "paper",
-          x = 0.5, y = 0.5,
-          showarrow = FALSE,
-          font = list(size = 14)
-        ) %>%
-        plotly::layout(
-          xaxis = list(showgrid = FALSE, showticklabels = FALSE),
-          yaxis = list(showgrid = FALSE, showticklabels = FALSE)
-        )
-      return(empty_plot)
-    }
-    
-    # Create heatmap using plotly with friendlier colors
-    p <- plotly::plot_ly(
-      data = comp_data,
-      x = ~field,
-      y = ~file_source,
-      z = ~completeness_pct,
-      type = "heatmap",
-      colorscale = list(
-        c(0, "#ffcccc"),      # Light red/pink for low (0-50%)
-        c(0.5, "#fff4cc"),    # Light yellow for medium (50-75%)
-        c(0.75, "#d4edda"),   # Light green for good (75-90%)
-        c(1, "#c3e6cb")       # Medium green for excellent (90-100%)
-      ),
-      colorbar = list(
-        title = "Completeness (%)",
-        tickformat = ".0f",
-        ticksuffix = "%"
-      ),
-      text = ~paste0(
-        "File: ", file_source, "<br>",
-        "Field: ", field, "<br>",
-        "Completeness: ", round(completeness_pct, 1), "%<br>",
-        "Records: ", n_present, " / ", n_total
-      ),
-      hoverinfo = "text",
-      hovertemplate = "%{text}<extra></extra>"
-    ) %>%
-      plotly::layout(
-        title = list(
-          text = "Metadata Completeness by File and Field",
-          font = list(size = 16)
-        ),
-        xaxis = list(
-          title = "Metadata Field",
-          tickangle = -45
-        ),
-        yaxis = list(
-          title = "File / Source"
-        ),
-        margin = list(l = 150, r = 50, t = 80, b = 100)
-      )
-    
-    return(p)
-  })
-  
   shiny::observeEvent(input$file_reimport, {
     file_extension <- tolower(tools::file_ext(input$file_reimport$datapath))