@@ -311,14 +311,6 @@ ui <- shiny::navbarPage("CiteSource",
311311 shiny :: h5(" Step 2: Double click on a column to edit sources, labels, and strings. Use *Ctrl+Enter* to save edits, one column at a time" ),
312312 # Output: Data file ----
313313 DT :: dataTableOutput(" tbl_out" ),
314- # Metadata completeness section - only show after upload
315- shiny :: conditionalPanel(
316- condition = " output.upload_complete" ,
317- shiny :: br(),
318- shiny :: h5(" Metadata Completeness by File" ),
319- shiny :: p(" This heatmap shows the percentage of records in each file that have each metadata field populated. Fields used for matching duplicates are highlighted." ),
320- plotly :: plotlyOutput(" completeness_heatmap" , height = " 600px" )
321- )
322314 )
323315 )
324316 )
@@ -1006,346 +998,6 @@ server <- function(input, output, session) {
1006998 }
1007999 })
10081000
1009- # Output to control visibility of completeness section
1010- output $ upload_complete <- shiny :: reactive({
1011- is.data.frame(rv $ df ) && nrow(rv $ df ) > 0
1012- })
1013- shiny :: outputOptions(output , " upload_complete" , suspendWhenHidden = FALSE )
1014-
1015- # Reactive to calculate metadata completeness by file
1016- completeness_data <- shiny :: reactive({
1017- # Only calculate if upload is complete (rv$df has data)
1018- if (! is.data.frame(rv $ df ) || nrow(rv $ df ) == 0 ) {
1019- return (data.frame ())
1020- }
1021-
1022- # Use upload_df if available, otherwise use latest_unique (for reimported files)
1023- data_to_use <- if (is.data.frame(rv $ upload_df ) && nrow(rv $ upload_df ) > 0 ) {
1024- rv $ upload_df
1025- } else if (is.data.frame(rv $ latest_unique ) && nrow(rv $ latest_unique ) > 0 ) {
1026- rv $ latest_unique
1027- } else {
1028- return (data.frame ())
1029- }
1030-
1031- # Require that data exists
1032- if (nrow(data_to_use ) == 0 ) {
1033- return (data.frame ())
1034- }
1035-
1036- # Define fields used for matching (key fields for deduplication)
1037- matching_fields <- c(
1038- " title" , " author" , " year" , " journal" , " abstract" ,
1039- " doi" , " volume" , " pages" , " number" , " isbn"
1040- )
1041-
1042- # Calculate completeness for each file/source
1043- completeness_list <- list ()
1044-
1045- # Get unique sources (files) - handle comma-separated sources by separating them
1046- # For upload_df, each record typically has one source, but we'll handle both cases
1047- if (" cite_source" %in% names(data_to_use )) {
1048- # Separate comma-separated sources if needed, but keep ALL columns using helper function
1049- source_data_expanded <- data_to_use %> %
1050- CiteSource ::: expand_single_metadata_column(" cite_source" )
1051-
1052- unique_sources <- unique(source_data_expanded $ cite_source )
1053- unique_sources <- unique_sources [! is.na(unique_sources ) & trimws(unique_sources ) != " " ]
1054- } else {
1055- return (data.frame ())
1056- }
1057-
1058- if (length(unique_sources ) == 0 ) {
1059- return (data.frame ())
1060- }
1061-
1062- for (source in unique_sources ) {
1063- # Filter data for this source (using expanded data where sources are already separated)
1064- source_data <- source_data_expanded [source_data_expanded $ cite_source == source , ]
1065-
1066- if (nrow(source_data ) == 0 ) next
1067-
1068- total_records <- nrow(source_data )
1069-
1070- # Get all available column names in the actual data
1071- available_cols <- names(source_data )
1072-
1073- # Calculate completeness for each field
1074- for (field in matching_fields ) {
1075- # Handle journal field - check multiple possible column names
1076- # In some formats, journal might be in "journal", "source", or other columns
1077- field_to_check <- NULL
1078-
1079- if (field == " journal" ) {
1080- # For journal, check multiple possible column names in order of preference
1081- # RIS files may use T2 tag which maps to "journal" or sometimes "source"
1082- journal_candidates <- c(" journal" , " source" , " journal_name" , " publication" , " pub_title" )
1083-
1084- for (candidate in journal_candidates ) {
1085- if (candidate %in% available_cols ) {
1086- # Check if this column actually has data (not all NA/empty)
1087- candidate_values <- source_data [[candidate ]]
1088- candidate_char <- as.character(candidate_values )
1089- candidate_char [is.na(candidate_char )] <- " "
1090- candidate_non_empty <- sum(trimws(candidate_char ) != " " & nchar(trimws(candidate_char )) > 0 )
1091-
1092- # If candidate is "source", also verify it's different from cite_source
1093- if (candidate == " source" && " cite_source" %in% available_cols ) {
1094- source_vals <- unique(candidate_char [trimws(candidate_char ) != " " ])
1095- cite_source_vals <- unique(source_data $ cite_source [! is.na(source_data $ cite_source ) & trimws(as.character(source_data $ cite_source )) != " " ])
1096- # If source values are the same as cite_source, skip it (it's metadata, not journal)
1097- if (length(source_vals ) > 0 && length(cite_source_vals ) > 0 && all(source_vals %in% cite_source_vals ) && length(source_vals ) == length(cite_source_vals )) {
1098- next # Skip this candidate, try next one
1099- }
1100- }
1101-
1102- # If we found a candidate with data, use it
1103- if (candidate_non_empty > 0 ) {
1104- field_to_check <- candidate
1105- break
1106- }
1107- }
1108- }
1109- } else if (field == " pages" ) {
1110- # For pages, check multiple possible column names
1111- # RIS files may use SP/EP tags which map to "startpage"/"endpage" or combined "pages"
1112- pages_candidates <- c(" pages" , " startpage" , " endpage" , " page" , " page_range" )
1113-
1114- for (candidate in pages_candidates ) {
1115- if (candidate %in% available_cols ) {
1116- # Check if this column actually has data (not all NA/empty)
1117- candidate_values <- source_data [[candidate ]]
1118- candidate_char <- as.character(candidate_values )
1119- candidate_char [is.na(candidate_char )] <- " "
1120- candidate_non_empty <- sum(trimws(candidate_char ) != " " & nchar(trimws(candidate_char )) > 0 )
1121-
1122- # If we found a candidate with data, use it
1123- if (candidate_non_empty > 0 ) {
1124- field_to_check <- candidate
1125- break
1126- }
1127- }
1128- }
1129-
1130- # If we have startpage or endpage but not pages, we can still count pages as present
1131- # if either startpage or endpage has data
1132- if (is.null(field_to_check )) {
1133- if (" startpage" %in% available_cols || " endpage" %in% available_cols ) {
1134- startpage_vals <- if (" startpage" %in% available_cols ) source_data $ startpage else NULL
1135- endpage_vals <- if (" endpage" %in% available_cols ) source_data $ endpage else NULL
1136-
1137- # Check if either has data
1138- has_startpage <- FALSE
1139- has_endpage <- FALSE
1140-
1141- if (! is.null(startpage_vals )) {
1142- startpage_char <- as.character(startpage_vals )
1143- startpage_char [is.na(startpage_char )] <- " "
1144- has_startpage <- sum(trimws(startpage_char ) != " " & nchar(trimws(startpage_char )) > 0 ) > 0
1145- }
1146-
1147- if (! is.null(endpage_vals )) {
1148- endpage_char <- as.character(endpage_vals )
1149- endpage_char [is.na(endpage_char )] <- " "
1150- has_endpage <- sum(trimws(endpage_char ) != " " & nchar(trimws(endpage_char )) > 0 ) > 0
1151- }
1152-
1153- # If either has data, we'll use startpage (or endpage if startpage doesn't exist)
1154- if (has_startpage && " startpage" %in% available_cols ) {
1155- field_to_check <- " startpage"
1156- } else if (has_endpage && " endpage" %in% available_cols ) {
1157- field_to_check <- " endpage"
1158- }
1159- }
1160- }
1161- } else {
1162- # For other fields, just check if the column exists
1163- if (field %in% available_cols ) {
1164- field_to_check <- field
1165- }
1166- }
1167-
1168- if (! is.null(field_to_check ) && field_to_check %in% names(source_data )) {
1169- # Special handling for pages when we have startpage/endpage separately
1170- if (field == " pages" && (field_to_check == " startpage" || field_to_check == " endpage" )) {
1171- # Count records that have EITHER startpage OR endpage (or both)
1172- startpage_vals <- if (" startpage" %in% available_cols ) source_data $ startpage else NULL
1173- endpage_vals <- if (" endpage" %in% available_cols ) source_data $ endpage else NULL
1174-
1175- # Convert both to character and check for non-empty values
1176- has_startpage <- rep(FALSE , total_records )
1177- has_endpage <- rep(FALSE , total_records )
1178-
1179- if (! is.null(startpage_vals )) {
1180- startpage_char <- as.character(startpage_vals )
1181- startpage_char [is.na(startpage_char )] <- " "
1182- startpage_char [startpage_char == " NA" ] <- " "
1183- startpage_char [startpage_char == " null" ] <- " "
1184- startpage_char [startpage_char == " NULL" ] <- " "
1185- has_startpage <- ! is.na(startpage_vals ) & trimws(startpage_char ) != " " & nchar(trimws(startpage_char )) > 0
1186- }
1187-
1188- if (! is.null(endpage_vals )) {
1189- endpage_char <- as.character(endpage_vals )
1190- endpage_char [is.na(endpage_char )] <- " "
1191- endpage_char [endpage_char == " NA" ] <- " "
1192- endpage_char [endpage_char == " null" ] <- " "
1193- endpage_char [endpage_char == " NULL" ] <- " "
1194- has_endpage <- ! is.na(endpage_vals ) & trimws(endpage_char ) != " " & nchar(trimws(endpage_char )) > 0
1195- }
1196-
1197- # Count records that have pages (either startpage or endpage)
1198- non_missing <- sum(has_startpage | has_endpage )
1199- } else {
1200- # Standard handling for other fields
1201- field_values <- source_data [[field_to_check ]]
1202-
1203- # Handle different data types (character, factor, list, etc.)
1204- if (is.list(field_values )) {
1205- # If it's a list, check if any elements are non-empty
1206- field_values <- sapply(field_values , function (x ) {
1207- if (is.null(x ) || length(x ) == 0 ) return (" " )
1208- if (is.character(x ) || is.factor(x )) return (paste(x , collapse = " " ))
1209- return (as.character(x ))
1210- })
1211- }
1212-
1213- # Convert to character - handle factors and other types
1214- if (is.factor(field_values )) {
1215- field_values_char <- as.character(field_values )
1216- } else {
1217- field_values_char <- as.character(field_values )
1218- }
1219-
1220- # Replace various missing value representations with empty string
1221- # First handle actual NA values
1222- is_na_original <- is.na(field_values )
1223- field_values_char [is_na_original ] <- " "
1224-
1225- # Then handle string representations of missing values
1226- field_values_char [field_values_char == " NA" ] <- " "
1227- field_values_char [field_values_char == " null" ] <- " "
1228- field_values_char [field_values_char == " NULL" ] <- " "
1229- field_values_char [field_values_char == " N/A" ] <- " "
1230- field_values_char [field_values_char == " n/a" ] <- " "
1231- field_values_char [field_values_char == " na" ] <- " "
1232- field_values_char [field_values_char == " Na" ] <- " "
1233-
1234- # Trim whitespace
1235- field_values_char <- trimws(field_values_char )
1236-
1237- # Count non-missing (not empty after trimming and not NA)
1238- # Check both the original field_values (for NA) and the character version (for empty strings)
1239- non_missing <- sum(
1240- ! is_na_original &
1241- field_values_char != " " &
1242- nchar(field_values_char ) > 0
1243- )
1244- }
1245-
1246- completeness_pct <- if (total_records > 0 ) (non_missing / total_records ) * 100 else 0
1247- n_present <- non_missing
1248- } else {
1249- completeness_pct <- 0
1250- n_present <- 0
1251- }
1252-
1253- completeness_list [[length(completeness_list ) + 1 ]] <- data.frame (
1254- file_source = source ,
1255- field = field ,
1256- completeness_pct = completeness_pct ,
1257- n_present = n_present ,
1258- n_total = total_records ,
1259- stringsAsFactors = FALSE
1260- )
1261- }
1262- }
1263-
1264- if (length(completeness_list ) == 0 ) {
1265- return (data.frame ())
1266- }
1267-
1268- # Combine into single data frame
1269- completeness_df <- dplyr :: bind_rows(completeness_list )
1270-
1271- # Order fields by importance for matching
1272- field_order <- c(" title" , " author" , " year" , " doi" , " journal" , " abstract" ,
1273- " volume" , " pages" , " number" , " isbn" )
1274- completeness_df $ field <- factor (completeness_df $ field , levels = field_order )
1275-
1276- # Order sources alphabetically
1277- completeness_df $ file_source <- factor (completeness_df $ file_source ,
1278- levels = sort(unique(completeness_df $ file_source )))
1279-
1280- return (completeness_df )
1281- })
1282-
1283- # Render completeness heatmap
1284- output $ completeness_heatmap <- plotly :: renderPlotly({
1285- comp_data <- completeness_data()
1286-
1287- # Return empty plot if no data
1288- if (! is.data.frame(comp_data ) || nrow(comp_data ) == 0 ) {
1289- empty_plot <- plotly :: plot_ly() %> %
1290- plotly :: add_annotations(
1291- text = " Upload files to see metadata completeness visualization" ,
1292- xref = " paper" , yref = " paper" ,
1293- x = 0.5 , y = 0.5 ,
1294- showarrow = FALSE ,
1295- font = list (size = 14 )
1296- ) %> %
1297- plotly :: layout(
1298- xaxis = list (showgrid = FALSE , showticklabels = FALSE ),
1299- yaxis = list (showgrid = FALSE , showticklabels = FALSE )
1300- )
1301- return (empty_plot )
1302- }
1303-
1304- # Create heatmap using plotly with friendlier colors
1305- p <- plotly :: plot_ly(
1306- data = comp_data ,
1307- x = ~ field ,
1308- y = ~ file_source ,
1309- z = ~ completeness_pct ,
1310- type = " heatmap" ,
1311- colorscale = list (
1312- c(0 , " #ffcccc" ), # Light red/pink for low (0-50%)
1313- c(0.5 , " #fff4cc" ), # Light yellow for medium (50-75%)
1314- c(0.75 , " #d4edda" ), # Light green for good (75-90%)
1315- c(1 , " #c3e6cb" ) # Medium green for excellent (90-100%)
1316- ),
1317- colorbar = list (
1318- title = " Completeness (%)" ,
1319- tickformat = " .0f" ,
1320- ticksuffix = " %"
1321- ),
1322- text = ~ paste0(
1323- " File: " , file_source , " <br>" ,
1324- " Field: " , field , " <br>" ,
1325- " Completeness: " , round(completeness_pct , 1 ), " %<br>" ,
1326- " Records: " , n_present , " / " , n_total
1327- ),
1328- hoverinfo = " text" ,
1329- hovertemplate = " %{text}<extra></extra>"
1330- ) %> %
1331- plotly :: layout(
1332- title = list (
1333- text = " Metadata Completeness by File and Field" ,
1334- font = list (size = 16 )
1335- ),
1336- xaxis = list (
1337- title = " Metadata Field" ,
1338- tickangle = - 45
1339- ),
1340- yaxis = list (
1341- title = " File / Source"
1342- ),
1343- margin = list (l = 150 , r = 50 , t = 80 , b = 100 )
1344- )
1345-
1346- return (p )
1347- })
1348-
13491001 shiny :: observeEvent(input $ file_reimport , {
13501002 file_extension <- tolower(tools :: file_ext(input $ file_reimport $ datapath ))
13511003
0 commit comments