Skip to content

Commit c174fcf

Browse files
committed
removed test metadata completeness feature maybe future dev
1 parent f74cc6a commit c174fcf

File tree

1 file changed

+0
-348
lines changed
  • inst/shiny-app/CiteSource

1 file changed

+0
-348
lines changed

inst/shiny-app/CiteSource/app2.R

Lines changed: 0 additions & 348 deletions
Original file line numberDiff line numberDiff line change
@@ -311,14 +311,6 @@ ui <- shiny::navbarPage("CiteSource",
311311
shiny::h5("Step 2: Double click on a column to edit sources, labels, and strings. Use *Ctrl+Enter* to save edits, one column at a time"),
312312
# Output: Data file ----
313313
DT::dataTableOutput("tbl_out"),
314-
# Metadata completeness section - only show after upload
315-
shiny::conditionalPanel(
316-
condition = "output.upload_complete",
317-
shiny::br(),
318-
shiny::h5("Metadata Completeness by File"),
319-
shiny::p("This heatmap shows the percentage of records in each file that have each metadata field populated. Fields used for matching duplicates are highlighted."),
320-
plotly::plotlyOutput("completeness_heatmap", height = "600px")
321-
)
322314
)
323315
)
324316
)
@@ -1006,346 +998,6 @@ server <- function(input, output, session) {
1006998
}
1007999
})
10081000

1009-
# Output to control visibility of completeness section
1010-
output$upload_complete <- shiny::reactive({
1011-
is.data.frame(rv$df) && nrow(rv$df) > 0
1012-
})
1013-
shiny::outputOptions(output, "upload_complete", suspendWhenHidden = FALSE)
1014-
1015-
# Reactive to calculate metadata completeness by file
1016-
completeness_data <- shiny::reactive({
1017-
# Only calculate if upload is complete (rv$df has data)
1018-
if (!is.data.frame(rv$df) || nrow(rv$df) == 0) {
1019-
return(data.frame())
1020-
}
1021-
1022-
# Use upload_df if available, otherwise use latest_unique (for reimported files)
1023-
data_to_use <- if (is.data.frame(rv$upload_df) && nrow(rv$upload_df) > 0) {
1024-
rv$upload_df
1025-
} else if (is.data.frame(rv$latest_unique) && nrow(rv$latest_unique) > 0) {
1026-
rv$latest_unique
1027-
} else {
1028-
return(data.frame())
1029-
}
1030-
1031-
# Require that data exists
1032-
if (nrow(data_to_use) == 0) {
1033-
return(data.frame())
1034-
}
1035-
1036-
# Define fields used for matching (key fields for deduplication)
1037-
matching_fields <- c(
1038-
"title", "author", "year", "journal", "abstract",
1039-
"doi", "volume", "pages", "number", "isbn"
1040-
)
1041-
1042-
# Calculate completeness for each file/source
1043-
completeness_list <- list()
1044-
1045-
# Get unique sources (files) - handle comma-separated sources by separating them
1046-
# For upload_df, each record typically has one source, but we'll handle both cases
1047-
if ("cite_source" %in% names(data_to_use)) {
1048-
# Separate comma-separated sources if needed, but keep ALL columns using helper function
1049-
source_data_expanded <- data_to_use %>%
1050-
CiteSource:::expand_single_metadata_column("cite_source")
1051-
1052-
unique_sources <- unique(source_data_expanded$cite_source)
1053-
unique_sources <- unique_sources[!is.na(unique_sources) & trimws(unique_sources) != ""]
1054-
} else {
1055-
return(data.frame())
1056-
}
1057-
1058-
if (length(unique_sources) == 0) {
1059-
return(data.frame())
1060-
}
1061-
1062-
for (source in unique_sources) {
1063-
# Filter data for this source (using expanded data where sources are already separated)
1064-
source_data <- source_data_expanded[source_data_expanded$cite_source == source, ]
1065-
1066-
if (nrow(source_data) == 0) next
1067-
1068-
total_records <- nrow(source_data)
1069-
1070-
# Get all available column names in the actual data
1071-
available_cols <- names(source_data)
1072-
1073-
# Calculate completeness for each field
1074-
for (field in matching_fields) {
1075-
# Handle journal field - check multiple possible column names
1076-
# In some formats, journal might be in "journal", "source", or other columns
1077-
field_to_check <- NULL
1078-
1079-
if (field == "journal") {
1080-
# For journal, check multiple possible column names in order of preference
1081-
# RIS files may use T2 tag which maps to "journal" or sometimes "source"
1082-
journal_candidates <- c("journal", "source", "journal_name", "publication", "pub_title")
1083-
1084-
for (candidate in journal_candidates) {
1085-
if (candidate %in% available_cols) {
1086-
# Check if this column actually has data (not all NA/empty)
1087-
candidate_values <- source_data[[candidate]]
1088-
candidate_char <- as.character(candidate_values)
1089-
candidate_char[is.na(candidate_char)] <- ""
1090-
candidate_non_empty <- sum(trimws(candidate_char) != "" & nchar(trimws(candidate_char)) > 0)
1091-
1092-
# If candidate is "source", also verify it's different from cite_source
1093-
if (candidate == "source" && "cite_source" %in% available_cols) {
1094-
source_vals <- unique(candidate_char[trimws(candidate_char) != ""])
1095-
cite_source_vals <- unique(source_data$cite_source[!is.na(source_data$cite_source) & trimws(as.character(source_data$cite_source)) != ""])
1096-
# If source values are the same as cite_source, skip it (it's metadata, not journal)
1097-
if (length(source_vals) > 0 && length(cite_source_vals) > 0 && all(source_vals %in% cite_source_vals) && length(source_vals) == length(cite_source_vals)) {
1098-
next # Skip this candidate, try next one
1099-
}
1100-
}
1101-
1102-
# If we found a candidate with data, use it
1103-
if (candidate_non_empty > 0) {
1104-
field_to_check <- candidate
1105-
break
1106-
}
1107-
}
1108-
}
1109-
} else if (field == "pages") {
1110-
# For pages, check multiple possible column names
1111-
# RIS files may use SP/EP tags which map to "startpage"/"endpage" or combined "pages"
1112-
pages_candidates <- c("pages", "startpage", "endpage", "page", "page_range")
1113-
1114-
for (candidate in pages_candidates) {
1115-
if (candidate %in% available_cols) {
1116-
# Check if this column actually has data (not all NA/empty)
1117-
candidate_values <- source_data[[candidate]]
1118-
candidate_char <- as.character(candidate_values)
1119-
candidate_char[is.na(candidate_char)] <- ""
1120-
candidate_non_empty <- sum(trimws(candidate_char) != "" & nchar(trimws(candidate_char)) > 0)
1121-
1122-
# If we found a candidate with data, use it
1123-
if (candidate_non_empty > 0) {
1124-
field_to_check <- candidate
1125-
break
1126-
}
1127-
}
1128-
}
1129-
1130-
# If we have startpage or endpage but not pages, we can still count pages as present
1131-
# if either startpage or endpage has data
1132-
if (is.null(field_to_check)) {
1133-
if ("startpage" %in% available_cols || "endpage" %in% available_cols) {
1134-
startpage_vals <- if ("startpage" %in% available_cols) source_data$startpage else NULL
1135-
endpage_vals <- if ("endpage" %in% available_cols) source_data$endpage else NULL
1136-
1137-
# Check if either has data
1138-
has_startpage <- FALSE
1139-
has_endpage <- FALSE
1140-
1141-
if (!is.null(startpage_vals)) {
1142-
startpage_char <- as.character(startpage_vals)
1143-
startpage_char[is.na(startpage_char)] <- ""
1144-
has_startpage <- sum(trimws(startpage_char) != "" & nchar(trimws(startpage_char)) > 0) > 0
1145-
}
1146-
1147-
if (!is.null(endpage_vals)) {
1148-
endpage_char <- as.character(endpage_vals)
1149-
endpage_char[is.na(endpage_char)] <- ""
1150-
has_endpage <- sum(trimws(endpage_char) != "" & nchar(trimws(endpage_char)) > 0) > 0
1151-
}
1152-
1153-
# If either has data, we'll use startpage (or endpage if startpage doesn't exist)
1154-
if (has_startpage && "startpage" %in% available_cols) {
1155-
field_to_check <- "startpage"
1156-
} else if (has_endpage && "endpage" %in% available_cols) {
1157-
field_to_check <- "endpage"
1158-
}
1159-
}
1160-
}
1161-
} else {
1162-
# For other fields, just check if the column exists
1163-
if (field %in% available_cols) {
1164-
field_to_check <- field
1165-
}
1166-
}
1167-
1168-
if (!is.null(field_to_check) && field_to_check %in% names(source_data)) {
1169-
# Special handling for pages when we have startpage/endpage separately
1170-
if (field == "pages" && (field_to_check == "startpage" || field_to_check == "endpage")) {
1171-
# Count records that have EITHER startpage OR endpage (or both)
1172-
startpage_vals <- if ("startpage" %in% available_cols) source_data$startpage else NULL
1173-
endpage_vals <- if ("endpage" %in% available_cols) source_data$endpage else NULL
1174-
1175-
# Convert both to character and check for non-empty values
1176-
has_startpage <- rep(FALSE, total_records)
1177-
has_endpage <- rep(FALSE, total_records)
1178-
1179-
if (!is.null(startpage_vals)) {
1180-
startpage_char <- as.character(startpage_vals)
1181-
startpage_char[is.na(startpage_char)] <- ""
1182-
startpage_char[startpage_char == "NA"] <- ""
1183-
startpage_char[startpage_char == "null"] <- ""
1184-
startpage_char[startpage_char == "NULL"] <- ""
1185-
has_startpage <- !is.na(startpage_vals) & trimws(startpage_char) != "" & nchar(trimws(startpage_char)) > 0
1186-
}
1187-
1188-
if (!is.null(endpage_vals)) {
1189-
endpage_char <- as.character(endpage_vals)
1190-
endpage_char[is.na(endpage_char)] <- ""
1191-
endpage_char[endpage_char == "NA"] <- ""
1192-
endpage_char[endpage_char == "null"] <- ""
1193-
endpage_char[endpage_char == "NULL"] <- ""
1194-
has_endpage <- !is.na(endpage_vals) & trimws(endpage_char) != "" & nchar(trimws(endpage_char)) > 0
1195-
}
1196-
1197-
# Count records that have pages (either startpage or endpage)
1198-
non_missing <- sum(has_startpage | has_endpage)
1199-
} else {
1200-
# Standard handling for other fields
1201-
field_values <- source_data[[field_to_check]]
1202-
1203-
# Handle different data types (character, factor, list, etc.)
1204-
if (is.list(field_values)) {
1205-
# If it's a list, check if any elements are non-empty
1206-
field_values <- sapply(field_values, function(x) {
1207-
if (is.null(x) || length(x) == 0) return("")
1208-
if (is.character(x) || is.factor(x)) return(paste(x, collapse = " "))
1209-
return(as.character(x))
1210-
})
1211-
}
1212-
1213-
# Convert to character - handle factors and other types
1214-
if (is.factor(field_values)) {
1215-
field_values_char <- as.character(field_values)
1216-
} else {
1217-
field_values_char <- as.character(field_values)
1218-
}
1219-
1220-
# Replace various missing value representations with empty string
1221-
# First handle actual NA values
1222-
is_na_original <- is.na(field_values)
1223-
field_values_char[is_na_original] <- ""
1224-
1225-
# Then handle string representations of missing values
1226-
field_values_char[field_values_char == "NA"] <- ""
1227-
field_values_char[field_values_char == "null"] <- ""
1228-
field_values_char[field_values_char == "NULL"] <- ""
1229-
field_values_char[field_values_char == "N/A"] <- ""
1230-
field_values_char[field_values_char == "n/a"] <- ""
1231-
field_values_char[field_values_char == "na"] <- ""
1232-
field_values_char[field_values_char == "Na"] <- ""
1233-
1234-
# Trim whitespace
1235-
field_values_char <- trimws(field_values_char)
1236-
1237-
# Count non-missing (not empty after trimming and not NA)
1238-
# Check both the original field_values (for NA) and the character version (for empty strings)
1239-
non_missing <- sum(
1240-
!is_na_original &
1241-
field_values_char != "" &
1242-
nchar(field_values_char) > 0
1243-
)
1244-
}
1245-
1246-
completeness_pct <- if (total_records > 0) (non_missing / total_records) * 100 else 0
1247-
n_present <- non_missing
1248-
} else {
1249-
completeness_pct <- 0
1250-
n_present <- 0
1251-
}
1252-
1253-
completeness_list[[length(completeness_list) + 1]] <- data.frame(
1254-
file_source = source,
1255-
field = field,
1256-
completeness_pct = completeness_pct,
1257-
n_present = n_present,
1258-
n_total = total_records,
1259-
stringsAsFactors = FALSE
1260-
)
1261-
}
1262-
}
1263-
1264-
if (length(completeness_list) == 0) {
1265-
return(data.frame())
1266-
}
1267-
1268-
# Combine into single data frame
1269-
completeness_df <- dplyr::bind_rows(completeness_list)
1270-
1271-
# Order fields by importance for matching
1272-
field_order <- c("title", "author", "year", "doi", "journal", "abstract",
1273-
"volume", "pages", "number", "isbn")
1274-
completeness_df$field <- factor(completeness_df$field, levels = field_order)
1275-
1276-
# Order sources alphabetically
1277-
completeness_df$file_source <- factor(completeness_df$file_source,
1278-
levels = sort(unique(completeness_df$file_source)))
1279-
1280-
return(completeness_df)
1281-
})
1282-
1283-
# Render completeness heatmap
1284-
output$completeness_heatmap <- plotly::renderPlotly({
1285-
comp_data <- completeness_data()
1286-
1287-
# Return empty plot if no data
1288-
if (!is.data.frame(comp_data) || nrow(comp_data) == 0) {
1289-
empty_plot <- plotly::plot_ly() %>%
1290-
plotly::add_annotations(
1291-
text = "Upload files to see metadata completeness visualization",
1292-
xref = "paper", yref = "paper",
1293-
x = 0.5, y = 0.5,
1294-
showarrow = FALSE,
1295-
font = list(size = 14)
1296-
) %>%
1297-
plotly::layout(
1298-
xaxis = list(showgrid = FALSE, showticklabels = FALSE),
1299-
yaxis = list(showgrid = FALSE, showticklabels = FALSE)
1300-
)
1301-
return(empty_plot)
1302-
}
1303-
1304-
# Create heatmap using plotly with friendlier colors
1305-
p <- plotly::plot_ly(
1306-
data = comp_data,
1307-
x = ~field,
1308-
y = ~file_source,
1309-
z = ~completeness_pct,
1310-
type = "heatmap",
1311-
colorscale = list(
1312-
c(0, "#ffcccc"), # Light red/pink for low (0-50%)
1313-
c(0.5, "#fff4cc"), # Light yellow for medium (50-75%)
1314-
c(0.75, "#d4edda"), # Light green for good (75-90%)
1315-
c(1, "#c3e6cb") # Medium green for excellent (90-100%)
1316-
),
1317-
colorbar = list(
1318-
title = "Completeness (%)",
1319-
tickformat = ".0f",
1320-
ticksuffix = "%"
1321-
),
1322-
text = ~paste0(
1323-
"File: ", file_source, "<br>",
1324-
"Field: ", field, "<br>",
1325-
"Completeness: ", round(completeness_pct, 1), "%<br>",
1326-
"Records: ", n_present, " / ", n_total
1327-
),
1328-
hoverinfo = "text",
1329-
hovertemplate = "%{text}<extra></extra>"
1330-
) %>%
1331-
plotly::layout(
1332-
title = list(
1333-
text = "Metadata Completeness by File and Field",
1334-
font = list(size = 16)
1335-
),
1336-
xaxis = list(
1337-
title = "Metadata Field",
1338-
tickangle = -45
1339-
),
1340-
yaxis = list(
1341-
title = "File / Source"
1342-
),
1343-
margin = list(l = 150, r = 50, t = 80, b = 100)
1344-
)
1345-
1346-
return(p)
1347-
})
1348-
13491001
shiny::observeEvent(input$file_reimport, {
13501002
file_extension <- tolower(tools::file_ext(input$file_reimport$datapath))
13511003

0 commit comments

Comments
 (0)