Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
rdattafl authored Aug 25, 2021
1 parent 33357e6 commit c5556b2
Show file tree
Hide file tree
Showing 3 changed files with 410 additions and 0 deletions.
369 changes: 369 additions & 0 deletions TestData_CHD/CHD_Data_ML_Prep.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,369 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/Users/dattarij/Documents/AutoMLPipe-BC-main/TestData_CHD'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('CHOP_features_1104.txt', delim_whitespace=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(810, 184527)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df.insert(1, \"InstanceID\", [i for i in range(len(df))])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4401403\n"
]
}
],
"source": [
"result = df.isna().sum().sum()\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.029447348432465336\n"
]
}
],
"source": [
"cells = 810*184527\n",
"naPerc = result/float(cells)\n",
"print(naPerc)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<bound method NDFrame.head of PHENOTYPE InstanceID rs3131972_A rs6604966_C rs9442373_C \\\n",
"0 1 0 0.0 1.0 1 \n",
"1 1 1 0.0 0.0 0 \n",
"2 1 2 0.0 0.0 1 \n",
"3 1 3 0.0 0.0 0 \n",
"4 1 4 1.0 0.0 1 \n",
".. ... ... ... ... ... \n",
"805 2 805 0.0 0.0 1 \n",
"806 2 806 0.0 1.0 1 \n",
"807 2 807 0.0 0.0 0 \n",
"808 2 808 0.0 0.0 2 \n",
"809 2 809 1.0 1.0 1 \n",
"\n",
" rs9442380_T rs4081334_G rs2887286_C rs111452679_A rs11260577_C ... \\\n",
"0 0 0.0 0 1.0 0.0 ... \n",
"1 0 0.0 0 0.0 0.0 ... \n",
"2 0 0.0 0 0.0 0.0 ... \n",
"3 0 0.0 1 1.0 0.0 ... \n",
"4 0 0.0 0 0.0 0.0 ... \n",
".. ... ... ... ... ... ... \n",
"805 0 0.0 0 0.0 0.0 ... \n",
"806 0 0.0 0 0.0 0.0 ... \n",
"807 0 0.0 0 0.0 0.0 ... \n",
"808 0 0.0 0 1.0 1.0 ... \n",
"809 1 1.0 1 0.0 1.0 ... \n",
"\n",
" rs5770807_C rs78309468_A rs134774_A rs9628185_C rs9616816_A \\\n",
"0 1.0 0.0 1.0 2.0 0 \n",
"1 0.0 0.0 1.0 0.0 0 \n",
"2 1.0 1.0 0.0 0.0 0 \n",
"3 1.0 0.0 0.0 1.0 1 \n",
"4 0.0 0.0 0.0 1.0 0 \n",
".. ... ... ... ... ... \n",
"805 1.0 0.0 0.0 1.0 2 \n",
"806 1.0 0.0 0.0 1.0 1 \n",
"807 1.0 0.0 1.0 0.0 0 \n",
"808 2.0 0.0 0.0 NaN 1 \n",
"809 1.0 0.0 0.0 0.0 0 \n",
"\n",
" rs5770988_A rs5770821_G rs715586_T rs756638_A rs3810648_G \n",
"0 0.0 1.0 0 2 0 \n",
"1 1.0 1.0 0 0 0 \n",
"2 0.0 0.0 0 1 0 \n",
"3 0.0 NaN 2 0 0 \n",
"4 1.0 NaN 0 1 0 \n",
".. ... ... ... ... ... \n",
"805 0.0 2.0 1 0 0 \n",
"806 0.0 1.0 0 1 1 \n",
"807 NaN 1.0 0 0 0 \n",
"808 0.0 NaN 2 0 0 \n",
"809 0.0 2.0 0 0 0 \n",
"\n",
"[810 rows x 184528 columns]>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<bound method NDFrame.head of PHENOTYPE InstanceID rs3131972_A rs6604966_C rs9442373_C rs9442380_T \\\n",
"0 1 0 0 1 1 0 \n",
"1 1 1 0 0 0 0 \n",
"2 1 2 0 0 1 0 \n",
"3 1 3 0 0 0 0 \n",
"4 1 4 1 0 1 0 \n",
".. ... ... ... ... ... ... \n",
"805 2 805 0 0 1 0 \n",
"806 2 806 0 1 1 0 \n",
"807 2 807 0 0 0 0 \n",
"808 2 808 0 0 2 0 \n",
"809 2 809 1 1 1 1 \n",
"\n",
" rs4081334_G rs2887286_C rs111452679_A rs11260577_C ... rs5770807_C \\\n",
"0 0 0 1 0 ... 1 \n",
"1 0 0 0 0 ... 0 \n",
"2 0 0 0 0 ... 1 \n",
"3 0 1 1 0 ... 1 \n",
"4 0 0 0 0 ... 0 \n",
".. ... ... ... ... ... ... \n",
"805 0 0 0 0 ... 1 \n",
"806 0 0 0 0 ... 1 \n",
"807 0 0 0 0 ... 1 \n",
"808 0 0 1 1 ... 2 \n",
"809 1 1 0 1 ... 1 \n",
"\n",
" rs78309468_A rs134774_A rs9628185_C rs9616816_A rs5770988_A rs5770821_G \\\n",
"0 0 1 2 0 0 1 \n",
"1 0 1 0 0 1 1 \n",
"2 1 0 0 0 0 0 \n",
"3 0 0 1 1 0 NaN \n",
"4 0 0 1 0 1 NaN \n",
".. ... ... ... ... ... ... \n",
"805 0 0 1 2 0 2 \n",
"806 0 0 1 1 0 1 \n",
"807 0 1 0 0 NaN 1 \n",
"808 0 0 NaN 1 0 NaN \n",
"809 0 0 0 0 0 2 \n",
"\n",
" rs715586_T rs756638_A rs3810648_G \n",
"0 0 2 0 \n",
"1 0 0 0 \n",
"2 0 1 0 \n",
"3 2 0 0 \n",
"4 0 1 0 \n",
".. ... ... ... \n",
"805 1 0 0 \n",
"806 0 1 1 \n",
"807 0 0 0 \n",
"808 2 0 0 \n",
"809 0 0 0 \n",
"\n",
"[810 rows x 184528 columns]>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = df.fillna(-1)\n",
"df2 = df2.astype(int)\n",
"df2 = df2.astype(str)\n",
"df2 = df2.replace('-1', np.nan)\n",
"df2.head"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/plain": [
"<bound method NDFrame.head of Class InstanceID rs3131972_A rs6604966_C rs9442373_C rs9442380_T \\\n",
"0 1 0 0 1 1 0 \n",
"1 1 1 0 0 0 0 \n",
"2 1 2 0 0 1 0 \n",
"3 1 3 0 0 0 0 \n",
"4 1 4 1 0 1 0 \n",
".. ... ... ... ... ... ... \n",
"805 2 805 0 0 1 0 \n",
"806 2 806 0 1 1 0 \n",
"807 2 807 0 0 0 0 \n",
"808 2 808 0 0 2 0 \n",
"809 2 809 1 1 1 1 \n",
"\n",
" rs4081334_G rs2887286_C rs111452679_A rs11260577_C ... rs5770807_C \\\n",
"0 0 0 1 0 ... 1 \n",
"1 0 0 0 0 ... 0 \n",
"2 0 0 0 0 ... 1 \n",
"3 0 1 1 0 ... 1 \n",
"4 0 0 0 0 ... 0 \n",
".. ... ... ... ... ... ... \n",
"805 0 0 0 0 ... 1 \n",
"806 0 0 0 0 ... 1 \n",
"807 0 0 0 0 ... 1 \n",
"808 0 0 1 1 ... 2 \n",
"809 1 1 0 1 ... 1 \n",
"\n",
" rs78309468_A rs134774_A rs9628185_C rs9616816_A rs5770988_A rs5770821_G \\\n",
"0 0 1 2 0 0 1 \n",
"1 0 1 0 0 1 1 \n",
"2 1 0 0 0 0 0 \n",
"3 0 0 1 1 0 NaN \n",
"4 0 0 1 0 1 NaN \n",
".. ... ... ... ... ... ... \n",
"805 0 0 1 2 0 2 \n",
"806 0 0 1 1 0 1 \n",
"807 0 1 0 0 NaN 1 \n",
"808 0 0 NaN 1 0 NaN \n",
"809 0 0 0 0 0 2 \n",
"\n",
" rs715586_T rs756638_A rs3810648_G \n",
"0 0 2 0 \n",
"1 0 0 0 \n",
"2 0 1 0 \n",
"3 2 0 0 \n",
"4 0 1 0 \n",
".. ... ... ... \n",
"805 1 0 0 \n",
"806 0 1 1 \n",
"807 0 0 0 \n",
"808 2 0 0 \n",
"809 0 0 0 \n",
"\n",
"[810 rows x 184528 columns]>"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.rename(columns={\"PHENOTYPE\": \"Class\"}, inplace=True)\n",
"df2.head"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"df2.to_csv('CHOP_features_1104_PHENOTYPE.txt', sep='\\t', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading

0 comments on commit c5556b2

Please sign in to comment.