Commit c6aa687

fix test errors after switching to pandas

1 parent efb170c commit c6aa687

File tree

2 files changed: +94 -61 lines

reproschema/redcap2reproschema.py

Lines changed: 80 additions & 48 deletions
@@ -83,7 +83,7 @@ def clean_header(header):
     cleaned_header = {}
     for k, v in header.items():
         # Strip BOM, whitespace, and enclosing quotation marks if present
-        cleaned_key = k.lstrip("\ufeff").strip().strip('"')
+        cleaned_key = k.lstrip("\ufeff").strip().strip('"') if isinstance(k, str) else k
         cleaned_header[cleaned_key] = v
     return cleaned_header
 
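The isinstance guard matters because pandas does not guarantee string keys: an unnamed column can surface as float NaN, and calling .lstrip() on a float raises AttributeError. A minimal sketch of the patched behavior (the sample header dict is invented for illustration):

    header = {'\ufeff"Variable / Field Name"': 'record_id', float('nan'): ''}
    cleaned = {}
    for k, v in header.items():
        # Non-string keys (e.g. NaN) now pass through untouched instead of crashing
        cleaned_key = k.lstrip('\ufeff').strip().strip('"') if isinstance(k, str) else k
        cleaned[cleaned_key] = v
    print(cleaned)  # {'Variable / Field Name': 'record_id', nan: ''}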

@@ -100,6 +100,12 @@ def normalize_condition(condition_str, field_type=None):
         return False
     elif condition_str is None:
         return None
+    elif not isinstance(condition_str, str):
+        # Convert non-string types to string, or return as is if conversion doesn't make sense
+        try:
+            condition_str = str(condition_str)
+        except:
+            return condition_str
 
     re_parentheses = re.compile(r"\(([0-9]*)\)")
     re_non_gt_lt_equal = re.compile(r"([^>|<])=")
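
A quick sketch of what the new branch tolerates, assuming normalize_condition is imported from this module (import path inferred from the file path above); pandas hands numeric cells over as float or int, which str() coerces before the regex passes run:

    from reproschema.redcap2reproschema import normalize_condition

    normalize_condition('[age] > 18')  # string input: rewritten exactly as before
    normalize_condition(3.0)           # float from a numeric CSV column: coerced via str(3.0)
    normalize_condition(None)          # still returns None, per the existing branch above
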
@@ -138,9 +144,9 @@ def process_field_properties(data):
     else:
         condition = True
 
-    # Check Field Annotation for special flags
-    annotation = data.get("Field Annotation", "").upper()
-    if condition and (
+    # Check Field Annotation for special flags - safely handle non-string values
+    annotation = str(data.get("Field Annotation", "")).upper() if data.get("Field Annotation") is not None else ""
+    if condition and isinstance(annotation, str) and (
         "@READONLY" in annotation
         or "@HIDDEN" in annotation
         or "@CALCTEXT" in annotation
@@ -152,12 +158,15 @@ def process_field_properties(data):
         "isAbout": f"items/{data['Variable / Field Name']}",
         "isVis": condition,
     }
-    if data["Required Field?"]:
-        if data["Required Field?"] in "y":
+
+    # Handle Required Field check, accounting for NaN values and empty strings
+    required_field = data.get("Required Field?")
+    if pd.notna(required_field) and str(required_field).strip():  # Check if value is not NaN and not empty
+        if str(required_field).lower() == "y":
             prop_obj["valueRequired"] = True
-        else:
-            raise (
-                f"value {data['Required Field?']} not supported yet for redcap:Required Field?"
+        elif str(required_field).lower() not in ["", "n"]:  # Only raise error for unexpected values
+            raise ValueError(
+                f"value {required_field} not supported yet for redcap:Required Field?"
             )
     return prop_obj
 
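The failure this rewrite targets comes from blank CSV cells: pandas reads them as float NaN, which is truthy, so the old truthiness check passed and the subsequent 'in "y"' test raised TypeError on a non-string. A small illustration of the pd.notna() gate (values invented):

    import pandas as pd

    pd.notna('y')           # True  -> str('y').lower() == 'y' sets valueRequired
    pd.notna('')            # True, but ''.strip() is falsy, so the branch is skipped
    pd.notna(float('nan'))  # False -> an empty cell never reaches the string tests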

@@ -256,6 +265,16 @@ def process_choices(choices_str, field_name):
 
 def parse_html(input_string, default_language="en"):
     result = {}
+
+    # Handle non-string input
+    if not isinstance(input_string, str):
+        if pd.isna(input_string):  # Handle NaN values
+            return {default_language: ""}
+        try:
+            input_string = str(input_string)
+        except:
+            return {default_language: str(input_string)}
+
     soup = BeautifulSoup(input_string, "html.parser")
 
     lang_elements = soup.find_all(True, {"lang": True})
@@ -268,9 +287,7 @@ def parse_html(input_string, default_language="en"):
     if not result:  # If no text was extracted
         result[default_language] = soup.get_text(strip=True)
     else:
-        result[default_language] = soup.get_text(
-            strip=True
-        )  # Use the entire text as default language text
+        result[default_language] = soup.get_text(strip=True)  # Use the entire text as default language text
     return result
 
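With the guard in place, parse_html degrades gracefully on the non-string values a pandas row can contain. A hedged sketch of the expected results, assuming the function is imported as in the earlier example:

    parse_html('<p lang="en">Consent form</p>')  # {'en': 'Consent form'}
    parse_html(float('nan'))                     # {'en': ''} -- NaN returns before BeautifulSoup runs
    parse_html(42)                               # {'en': '42'} -- coerced with str() first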

@@ -508,59 +525,74 @@ def parse_language_iso_codes(input_string):
     ]
 
 
-def process_csv_with_pandas(
+def process_csv(
     csv_file, abs_folder_path, schema_context_url, protocol_name
 ):
     datas = {}
     order = {}
     compute = {}
     languages = []
 
-    df = pd.read_csv(csv_file, encoding="utf-8")
-    df = df.applymap(
-        lambda x: x.strip() if isinstance(x, str) else x
-    )  # Clean headers
-
-    for form_name, group in df.groupby("Form Name"):
-        datas[form_name] = group.to_dict(orient="records")
+    # Read CSV with explicit BOM handling, and maintain original order
+    df = pd.read_csv(csv_file, encoding="utf-8-sig")  # utf-8-sig handles BOM automatically
+
+    # Clean column names (headers)
+    df.columns = df.columns.map(lambda x: x.strip().strip('"').lstrip("\ufeff"))
+
+    # Clean string values in the dataframe
+    object_columns = df.select_dtypes(include=['object']).columns
+    for col in object_columns:
+        df[col] = df[col].astype(str).replace('nan', '')
+
+    # Initialize structures for each unique form
+    unique_forms = df["Form Name"].unique()
+    for form_name in unique_forms:
+        datas[form_name] = []
         order[form_name] = []
         compute[form_name] = []
         os.makedirs(
             f"{abs_folder_path}/activities/{form_name}/items", exist_ok=True
         )
 
-        # TODO: should we bring back the language
-        # if not languages:
-        #     languages = parse_language_iso_codes(row["Field Label"])
-
-        for _, row in group.iterrows():
-            field_name = row["Variable / Field Name"]
-            if row.get("Field Type", "") in COMPUTE_LIST:
-                # TODO: this right now doesn't give jsExpression
-                condition = normalize_condition(
-                    row["Choices, Calculations, OR Slider Labels"],
-                    field_type=row["Field Type"],
-                )
+    # TODO: should we bring back the language
+    # if not languages:
+    #     languages = parse_language_iso_codes(row["Field Label"])
+
+    # Process rows in original order
+    for _, row in df.iterrows():
+        form_name = row["Form Name"]
+        field_name = row["Variable / Field Name"]
+        field_type = row.get("Field Type", "")
+        field_annotation = row.get("Field Annotation")
+
+        # Add row data to datas dictionary
+        datas[form_name].append(row.to_dict())
+
+        if field_type in COMPUTE_LIST:
+            condition = normalize_condition(
+                row["Choices, Calculations, OR Slider Labels"],
+                field_type=field_type,
+            )
+            compute[form_name].append(
+                {
+                    "variableName": field_name,
+                    "jsExpression": condition,
+                }
+            )
+        elif isinstance(field_annotation, str) and "@CALCTEXT" in field_annotation.upper():
+            calc_text = field_annotation
+            match = re.search(r"@CALCTEXT\((.*)\)", calc_text)
+            if match:
+                js_expression = match.group(1)
+                js_expression = normalize_condition(js_expression)
                 compute[form_name].append(
                     {
                         "variableName": field_name,
-                        "jsExpression": condition,
+                        "jsExpression": js_expression,
                     }
                 )
-            elif "@CALCTEXT" in row.get("Field Annotation", "").upper():
-                calc_text = row["Field Annotation"]
-                match = re.search(r"@CALCTEXT\((.*)\)", calc_text)
-                if match:
-                    js_expression = match.group(1)
-                    js_expression = normalize_condition(js_expression)
-                    compute[form_name].append(
-                        {
-                            "variableName": field_name,
-                            "jsExpression": js_expression,
-                        }
-                    )
-            else:
-                order[form_name].append(f"items/{field_name}")
+        else:
+            order[form_name].append(f"items/{field_name}")
 
     os.makedirs(f"{abs_folder_path}/{protocol_name}", exist_ok=True)
     return datas, order, compute, languages
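
The switch from utf-8 to utf-8-sig is the load-bearing change here: with plain utf-8, an Excel-style BOM survives decoding and glues itself onto the first header name. A standalone check (the CSV bytes are invented):

    import io
    import pandas as pd

    raw = '\ufeffVariable / Field Name,Form Name\nage,demographics\n'.encode('utf-8')

    pd.read_csv(io.BytesIO(raw), encoding='utf-8').columns[0]      # '\ufeffVariable / Field Name'
    pd.read_csv(io.BytesIO(raw), encoding='utf-8-sig').columns[0]  # 'Variable / Field Name'

Note also that astype(str) stringifies missing values as the literal string 'nan', which is why replace('nan', '') immediately follows it in the cleaning loop above.
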
@@ -602,7 +634,7 @@ def redcap2reproschema(
     schema_context_url = CONTEXTFILE_URL
 
     # Process the CSV file
-    datas, order, compute, _ = process_csv_with_pandas(
+    datas, order, compute, _ = process_csv(
         csv_file,
         abs_folder_path,
         schema_context_url,

reproschema/tests/test_redcap2reproschema.py

Lines changed: 14 additions & 13 deletions
@@ -23,13 +23,16 @@ def test_redcap2reproschema(tmpdir):
     temp_csv_file = tmpdir.join(CSV_FILE_NAME)
     temp_yaml_file = tmpdir.join(YAML_FILE_NAME)
 
-    shutil.copy(CSV_TEST_FILE, str(temp_csv_file))  # Convert to string
-    shutil.copy(YAML_TEST_FILE, str(temp_yaml_file))  # Convert to string
-    print("tmpdir: ", tmpdir)
-    # Change the current working directory to tmpdir
+    shutil.copy(CSV_TEST_FILE, str(temp_csv_file))
+    shutil.copy(YAML_TEST_FILE, str(temp_yaml_file))
+
+    # Add debug output to see the content of the CSV file
+    with open(str(temp_csv_file), 'r') as f:
+        print("CSV content:", f.read())
+
     with tmpdir.as_cwd():
         # Read YAML to find the expected output directory name
-        with open(str(temp_yaml_file), "r") as file:  # Convert to string
+        with open(str(temp_yaml_file), "r") as file:
             protocol = yaml.safe_load(file)
             protocol_name = protocol.get("protocol_name", "").replace(" ", "_")
 

@@ -39,12 +42,10 @@ def test_redcap2reproschema(tmpdir):
                 "redcap2reproschema",
                 str(temp_csv_file),
                 str(temp_yaml_file),
-            ],  # Convert to string
+            ],
         )
-
-        assert (
-            result.exit_code == 0
-        ), f"The command failed to execute successfully: {result.output}"
-        assert os.path.isdir(
-            protocol_name
-        ), f"Expected output directory '{protocol_name}' does not exist"
+
+        print("Command output:", result.output)  # Add debug output
+
+        assert result.exit_code == 0, f"Command failed with: {result.output}"
+        assert os.path.isdir(protocol_name), f"Expected output directory '{protocol_name}' does not exist"
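
For reference, the exit_code and output attributes asserted on above follow the result API of click's CliRunner; the runner setup sits outside this hunk, so that is an assumption. A self-contained sketch with a stand-in command (demo is invented; the real test invokes the redcap2reproschema CLI):

    import click
    from click.testing import CliRunner

    @click.command()
    @click.argument('csv_file')
    @click.argument('yaml_file')
    def demo(csv_file, yaml_file):
        click.echo(f'processing {csv_file} with {yaml_file}')

    runner = CliRunner()
    result = runner.invoke(demo, ['in.csv', 'cfg.yaml'])
    print('Command output:', result.output)
    assert result.exit_code == 0, f'Command failed with: {result.output}'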
