file changes

MarcioPorto · Mar 26, 2020 · a7992b4 · a7992b4
1 parent 7eec3db
commit a7992b4
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 19 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,4 +3,7 @@
 *.csv
 
 # vscode files
-.vscode
+.vscode
+
+# mac files
+.DS_Store
diff --git a/scripts/README.md b/scripts/README.md
@@ -1,16 +1,17 @@
 ### Instructions to generate csv from i2b2 Obesity Challenge xml files
 
-First gain access to dataset. Once you have the access, you need to download the following xml files and rename exactly as below:
-1. Download Training Data: Obesity Training Records (XML) as rename as `text1.xml`
-2. Download Training Data: The second set of Obesity Training Records (XML) as rename as `text2.xml`
-3. Download Training Data: Intuitive Annotations for Training Records (XML) as rename as `label1.xml`
-4. Download Training Data: Annotations for the second set of Obesity Training Records (XML) as rename as `label2.xml`
-5. Download Training Data: Addendum to the Intuitive Annotations for Training Records (XML) as rename as `label3.xml`
-6. Download Test Data: Test Records (XML) as rename as `test_text.xml`
-7. Download Test Data: Ground Truth for Intuitive Judgments on Test Data (XML) as rename as `test_label.xml`
+First, gain access to the dataset. Once you have access, download the following XML files:
+1. Download Training Data: Obesity Training Records (XML).
+2. Download Training Data: The second set of Obesity Training Records (XML).
+3. Download Training Data: Intuitive Annotations for Training Records (XML).
+4. Download Training Data: Annotations for the second set of Obesity Training Records (XML).
+5. Download Training Data: Addendum to the Intuitive Annotations for Training Records (XML).
+6. Download Test Data: Test Records (XML).
+7. Download Test Data: Ground Truth for Intuitive Judgments on Test Data (XML).
 
-Move all the files inside the `scripts` folder.
+Move all the files, without renaming them, into the `downloads` folder at the root of the project.
 
-Install the dependencies first using `pip install -r requirments.txt`
+Install the dependencies using `pip install -r requirments.txt`
 
-Run `python gen_csv.py` to generate `train.csv` and `test.csv`. They will be in the `data` folder.
+Run `python gen_csv.py` to generate `train.csv` and `test.csv`. 
+They will be in the `data` folder at the root of the project.
diff --git a/scripts/gen_csv.py b/scripts/gen_csv.py
@@ -51,13 +51,16 @@ def get_attr(self):
 if not os.path.isdir('../data'):
     os.mkdir('../data')
 
+assert os.path.isdir('../downloads') , "no downloads folder found"
+
+
 dic = {}
 
-train_files = ['text1.xml','text2.xml']
+train_files = ['obesity_patient_records_training.xml','obesity_patient_records_training2.xml']
 
 for file in train_files:
     # read the training data
-    tree = ElementTree.parse('./' + file)
+    tree = ElementTree.parse('../downloads/' + file)
     root = tree.getroot()
     docs  = root.findall('docs')[0]
     for doc in docs:
@@ -74,11 +77,13 @@ def get_attr(self):
         # store in dic
         dic[pat.id] = pat
 
-train_labels = ['label1.xml','label2.xml','label3.xml']
+train_labels = ['obesity_standoff_intuitive_annotations_training.xml',
+                'obesity_standoff_annotations_training_addendum3.xml',
+                'obesity_standoff_annotations_training_addendum.xml']
 
 for file in train_labels:
     # read the annotations
-    tree = ElementTree.parse('./' + file)
+    tree = ElementTree.parse('../downloads/' + file)
     root = tree.getroot()
     diseases  = root.findall('diseases')[0]
     for disease in diseases:
@@ -95,7 +100,7 @@ def get_attr(self):
 dic_t = {}
 
 # read the test data
-tree = ElementTree.parse('./test_text.xml')
+tree = ElementTree.parse('../downloads/obesity_patient_records_test.xml')
 root = tree.getroot()
 docs  = root.findall('docs')[0]
 for doc in docs:
@@ -113,7 +118,7 @@ def get_attr(self):
     dic_t[pat.id] = pat
 
 # read the test annotations
-tree = ElementTree.parse('./test_label.xml')
+tree = ElementTree.parse('../downloads/obesity_standoff_annotations_test_intuitive.xml')
 root = tree.getroot()
 diseases  = root.findall('diseases')[0]
 for disease in diseases:
@@ -123,6 +128,6 @@ def get_attr(self):
         id_ = int(doc.attrib['id'])
         setattr(dic_t[id_] , dis_name, doc.attrib['judgment'])
 
-data_test = pd.DataFrame([pat.get_attr for pat in dic_t.values()]) 
+data_test = pd.DataFrame([pat.get_attr() for pat in dic_t.values()])
 
 data_test.to_csv('../data/test.csv', sep=',') # spit the test data to the csv