@@ -83,7 +83,7 @@ def clean_header(header):
     cleaned_header = {}
     for k, v in header.items():
         # Strip BOM, whitespace, and enclosing quotation marks if present
-        cleaned_key = k.lstrip("\ufeff ").strip().strip('"')
+        cleaned_key = k.lstrip("\ufeff ").strip().strip('"') if isinstance(k, str) else k
         cleaned_header[cleaned_key] = v
     return cleaned_header

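A quick sanity check of the new guard in clean_header, using hypothetical header keys (not from the PR): string keys are still scrubbed of BOM, whitespace, and enclosing quotes, while non-string keys now pass through unchanged instead of raising AttributeError on .lstrip():

    header = {'\ufeff"Variable / Field Name"': "age", 3: "stray non-string key"}
    print(clean_header(header))
    # {'Variable / Field Name': 'age', 3: 'stray non-string key'}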
@@ -100,6 +100,12 @@ def normalize_condition(condition_str, field_type=None):
         return False
     elif condition_str is None:
         return None
+    elif not isinstance(condition_str, str):
+        # Convert non-string types to string, or return as is if conversion doesn't make sense
+        try:
+            condition_str = str(condition_str)
+        except:
+            return condition_str

     re_parentheses = re.compile(r"\(([0-9]*)\)")
     re_non_gt_lt_equal = re.compile(r"([^>|<])=")
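The practical effect, sketched with hypothetical inputs: pandas often hands numeric cells over as int or float, and those previously reached the regex passes below and raised TypeError ("expected string or bytes-like object"). They are now coerced first:

    normalize_condition(None)  # still returns None, unchanged behavior
    normalize_condition(4)     # condition_str becomes "4" before the regex passes run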
@@ -138,9 +144,9 @@ def process_field_properties(data):
     else:
         condition = True

-    # Check Field Annotation for special flags
-    annotation = data.get("Field Annotation", "").upper()
-    if condition and (
+    # Check Field Annotation for special flags - safely handle non-string values
+    annotation = str(data.get("Field Annotation", "")).upper() if data.get("Field Annotation") is not None else ""
+    if condition and isinstance(annotation, str) and (
         "@READONLY" in annotation
         or "@HIDDEN" in annotation
         or "@CALCTEXT" in annotation
@@ -152,12 +158,15 @@ def process_field_properties(data):
         "isAbout": f"items/{data['Variable / Field Name']}",
         "isVis": condition,
     }
-    if data["Required Field?"]:
-        if data["Required Field?"] in "y":
+
+    # Handle Required Field check, accounting for NaN values and empty strings
+    required_field = data.get("Required Field?")
+    if pd.notna(required_field) and str(required_field).strip():  # Check if value is not NaN and not empty
+        if str(required_field).lower() == "y":
             prop_obj["valueRequired"] = True
-        else:
-            raise (
-                f"value {data['Required Field?']} not supported yet for redcap:Required Field?"
+        elif str(required_field).lower() not in ["", "n"]:  # Only raise error for unexpected values
+            raise ValueError(
+                f"value {required_field} not supported yet for redcap:Required Field?"
             )
     return prop_obj

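Why the old version broke, in miniature: NaN is truthy, so a blank cell got past the outer if and `nan in "y"` then raised TypeError; a legitimate "n" fell into the else branch; and `raise (f"...")` is itself illegal, since Python 3 only raises BaseException subclasses. The pd.notna() guard plus an explicit ValueError addresses all three:

    import pandas as pd

    bool(float("nan"))      # True - NaN passed `if data["Required Field?"]:`
    # float("nan") in "y"   # TypeError: 'in <string>' requires string as left operand
    pd.notna(float("nan"))  # False - the new guard now skips blank cells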
@@ -256,6 +265,16 @@ def process_choices(choices_str, field_name):


 def parse_html(input_string, default_language="en"):
     result = {}
+
+    # Handle non-string input
+    if not isinstance(input_string, str):
+        if pd.isna(input_string):  # Handle NaN values
+            return {default_language: ""}
+        try:
+            input_string = str(input_string)
+        except:
+            return {default_language: str(input_string)}
+
     soup = BeautifulSoup(input_string, "html.parser")

     lang_elements = soup.find_all(True, {"lang": True})
@@ -268,9 +287,7 @@ def parse_html(input_string, default_language="en"):
     if not result:  # If no text was extracted
         result[default_language] = soup.get_text(strip=True)
     else:
-        result[default_language] = soup.get_text(
-            strip=True
-        )  # Use the entire text as default language text
+        result[default_language] = soup.get_text(strip=True)  # Use the entire text as default language text
     return result


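End-to-end behavior of the reworked parse_html, with hypothetical inputs: NaN short-circuits to an empty label, and other scalars are stringified and parsed as ordinary text:

    parse_html(float("nan"))  # {'en': ''}
    parse_html(42)            # {'en': '42'} - no lang tags, so the `if not result` branch fills the default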
@@ -508,59 +525,74 @@ def parse_language_iso_codes(input_string):
     ]


-def process_csv_with_pandas(
+def process_csv(
     csv_file, abs_folder_path, schema_context_url, protocol_name
 ):
     datas = {}
     order = {}
     compute = {}
     languages = []

-    df = pd.read_csv(csv_file, encoding="utf-8")
-    df = df.applymap(
-        lambda x: x.strip() if isinstance(x, str) else x
-    )  # Clean headers
-
-    for form_name, group in df.groupby("Form Name"):
-        datas[form_name] = group.to_dict(orient="records")
+    # Read CSV with explicit BOM handling, and maintain original order
+    df = pd.read_csv(csv_file, encoding="utf-8-sig")  # utf-8-sig handles BOM automatically
+
+    # Clean column names (headers)
+    df.columns = df.columns.map(lambda x: x.strip().strip('"').lstrip("\ufeff "))
+
+    # Clean string values in the dataframe
+    object_columns = df.select_dtypes(include=['object']).columns
+    for col in object_columns:
+        df[col] = df[col].astype(str).replace('nan', '')
+
+    # Initialize structures for each unique form
+    unique_forms = df["Form Name"].unique()
+    for form_name in unique_forms:
+        datas[form_name] = []
         order[form_name] = []
         compute[form_name] = []
         os.makedirs(
             f"{abs_folder_path}/activities/{form_name}/items", exist_ok=True
         )

-        # TODO: should we bring back the language
-        # if not languages:
-        #     languages = parse_language_iso_codes(row["Field Label"])
-
-        for _, row in group.iterrows():
-            field_name = row["Variable / Field Name"]
-            if row.get("Field Type", "") in COMPUTE_LIST:
-                # TODO: this right now doesn't give jsExpression
-                condition = normalize_condition(
-                    row["Choices, Calculations, OR Slider Labels"],
-                    field_type=row["Field Type"],
-                )
+    # TODO: should we bring back the language
+    # if not languages:
+    #     languages = parse_language_iso_codes(row["Field Label"])
+
+    # Process rows in original order
+    for _, row in df.iterrows():
+        form_name = row["Form Name"]
+        field_name = row["Variable / Field Name"]
+        field_type = row.get("Field Type", "")
+        field_annotation = row.get("Field Annotation")
+
+        # Add row data to datas dictionary
+        datas[form_name].append(row.to_dict())
+
+        if field_type in COMPUTE_LIST:
+            condition = normalize_condition(
+                row["Choices, Calculations, OR Slider Labels"],
+                field_type=field_type,
+            )
+            compute[form_name].append(
+                {
+                    "variableName": field_name,
+                    "jsExpression": condition,
+                }
+            )
+        elif isinstance(field_annotation, str) and "@CALCTEXT" in field_annotation.upper():
+            calc_text = field_annotation
+            match = re.search(r"@CALCTEXT\((.*)\)", calc_text)
+            if match:
+                js_expression = match.group(1)
+                js_expression = normalize_condition(js_expression)
                 compute[form_name].append(
                     {
                         "variableName": field_name,
-                        "jsExpression": condition,
+                        "jsExpression": js_expression,
                     }
                 )
-            elif "@CALCTEXT" in row.get("Field Annotation", "").upper():
-                calc_text = row["Field Annotation"]
-                match = re.search(r"@CALCTEXT\((.*)\)", calc_text)
-                if match:
-                    js_expression = match.group(1)
-                    js_expression = normalize_condition(js_expression)
-                    compute[form_name].append(
-                        {
-                            "variableName": field_name,
-                            "jsExpression": js_expression,
-                        }
-                    )
-            else:
-                order[form_name].append(f"items/{field_name}")
+        else:
+            order[form_name].append(f"items/{field_name}")

     os.makedirs(f"{abs_folder_path}/{protocol_name}", exist_ok=True)
     return datas, order, compute, languages
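Two details in this rewrite are easy to miss. First, the utf-8-sig codec consumes a leading BOM itself, so the lstrip("\ufeff ") on the header map becomes a belt-and-braces fallback rather than a necessity. A minimal check with hypothetical bytes:

    import io

    import pandas as pd

    raw = b"\xef\xbb\xbfVariable / Field Name,Form Name\nage,demo\n"
    df = pd.read_csv(io.BytesIO(raw), encoding="utf-8-sig")
    print(df.columns[0])  # 'Variable / Field Name' - BOM already stripped

Second, df[col].astype(str).replace('nan', '') maps genuine NaN cells to empty strings (Series.replace matches whole cell values, not substrings), which is what keeps the later row.get(...) handling simple - at the cost of also blanking any cell whose literal content is the string 'nan'.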
@@ -602,7 +634,7 @@ def redcap2reproschema(
     schema_context_url = CONTEXTFILE_URL

     # Process the CSV file
-    datas, order, compute, _ = process_csv_with_pandas(
+    datas, order, compute, _ = process_csv(
         csv_file,
         abs_folder_path,
         schema_context_url,