From bae346a69c6e4b61dfd2608ae92e584842ec57fd Mon Sep 17 00:00:00 2001 From: Ridhi Bandaru <77619512+bendemonium@users.noreply.github.com> Date: Thu, 30 Jan 2025 21:47:16 -0500 Subject: [PATCH] Add files via upload --- DMMidTerm_TeamID_02.ipynb | 1432 +++++++++++++++++++++++++++++++++++++ 1 file changed, 1432 insertions(+) create mode 100644 DMMidTerm_TeamID_02.ipynb diff --git a/DMMidTerm_TeamID_02.ipynb b/DMMidTerm_TeamID_02.ipynb new file mode 100644 index 0000000..02b19b3 --- /dev/null +++ b/DMMidTerm_TeamID_02.ipynb @@ -0,0 +1,1432 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "UeVUDmw_Xuyl" + }, + "source": [ + "# NFHS Data Mining: Menstrual Products and Birth-Related Issues\n", + "\n", + "Group 2 Presents an evaluation of how the usage of different menstrual products could result in birth-related issues or pregnancy complications by studying the NFHS data set, IABR, specifically the attributes, giving insight into menstrual products used, complications in abortion (or month when pregnancy ended), size of child at birth and smoking-related habits. 
We tried to understand this in relation to relevant literature surrounding the topic.\n", + "\n", + "\n", + "*Problem Statement*: finding the degree of impact of menstrual products on birth and complications related to birth\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "q1Kcaq9kRJLY" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import math\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2V6wimCAZhXT" + }, + "outputs": [], + "source": [ + "data2015 = pd.read_csv('/content/IABR2015-16.csv', low_memory=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "n6iF1CiRWik0" + }, + "source": [ + "# Data Cleaning and Preprocessing\n", + "\n", + "Firstly, features related to period products used, smoking habits, birth weight, birth size, urban/rural setting, first period age, pregnancy termination, and complication in pregnancy are extracted.\n", + "\n", + "Then, this data is cleaned by eliminating the samples where the person smokes, and empty cells are either filled or removed depending on how many missing values there are.\n", + "\n", + "The preprocessing techniques employed are: **data cleaning and smoothing, feature subset selection, aggregation,** and **entropy-based discretization**." 
# feature subset selection
FEATURE_COLUMNS = ['S257A','S257B','S257C','S257D','S257E','S257X','V229','S240','M18','M19','V025','V024','V026','S256','V463A','V463B','V463C','V463D','V463E','V463F','V463G']
SMOKE_COLUMNS = ['V463A','V463B','V463C','V463D','V463E','V463F','V463G']

# Take an explicit copy: the cleaning steps below assign into `data`, and doing
# that on a bare column selection of data2015 relies on chained assignment,
# which pandas warns about and which silently does nothing under Copy-on-Write.
data = data2015[FEATURE_COLUMNS].copy()

# removing rows where data related to menstrual products is not available
# (a single space is treated as "no value" for S257A)
data['S257A'] = data['S257A'].replace(' ', np.nan)
data = data.dropna(subset=['S257A'])

# converting smoke features to numeric data
# (pd.to_numeric still raises on non-numeric text, exactly as before)
data[SMOKE_COLUMNS] = data[SMOKE_COLUMNS].apply(pd.to_numeric)

# aggregation of smoke features, removing the smoke positive data.
# skipna=False keeps the semantics of adding the columns with `+`: a row with a
# missing flag gets SMOKE = NaN and is therefore removed by the `== 0` filter.
data['SMOKE'] = data[SMOKE_COLUMNS].sum(axis=1, skipna=False)
data = data[data['SMOKE'] == 0]
data = data.drop(['SMOKE'] + SMOKE_COLUMNS + ['V026'], axis=1)

# dataframe saved as a CSV file for further processing
data.to_csv('remove_smoke.csv')
def entropy(pos, neg):
    """Return the Shannon entropy, in bits, of a two-class bin.

    Parameters
    ----------
    pos, neg : int or float
        Number of samples of each of the two classes in the bin.

    Returns
    -------
    int or float
        0 when either count is zero (the bin is pure or empty), otherwise
        -(p * log2(p) + q * log2(q)) with p = pos / (pos + neg) and q = 1 - p.
    """
    # Check the degenerate cases before dividing: an empty bin (0, 0) used to
    # raise ZeroDivisionError because the proportions were computed first.
    if pos == 0 or neg == 0:
        return 0

    total = pos + neg
    pos_part = pos / total
    neg_part = neg / total
    return -(pos_part * math.log2(pos_part) + neg_part * math.log2(neg_part))
"bin1_2_1_total = len(bin1_2_1[bin1_2_1 == True].index)\n", + "bin1_2total = bin1_2_0_total + bin1_2_1_total\n", + "# calculating entropy of bin 2\n", + "entropy1_2 = entropy(bin1_2_0_total,bin1_2_1_total)\n", + "\n", + "# making bin 3\n", + "bin1_3_0= data.apply(lambda x : True if (x['M18'] == 3 and x['S240']==0) else False, axis = 1)\n", + "bin1_3_0_total = len(bin1_3_0[bin1_3_0 == True].index)\n", + "bin1_3_1= data.apply(lambda x : True if (x['M18'] == 3 and x['S240']==1) else False, axis = 1)\n", + "bin1_3_1_total = len(bin1_3_1[bin1_3_1 == True].index)\n", + "bin1_3total = bin1_3_0_total + bin1_3_1_total\n", + "# calculating entropy of bin 3\n", + "entropy1_3 = entropy(bin1_3_0_total,bin1_3_1_total)\n", + "\n", + "# calculating total entropy of split 1\n", + "total = bin1_1total + bin1_2total + bin1_3total\n", + "total_entropy1 = (bin1_1total*entropy1_1 + bin1_2total*entropy1_2 + bin1_3total*entropy1_3)/total\n", + "\n", + "print('Split 1 BIN TABLE:')\n", + "print(bin1_1_0_total, bin1_1_1_total, bin1_1total)\n", + "print(bin1_2_0_total, bin1_2_1_total, bin1_2total)\n", + "print(bin1_3_0_total, bin1_3_1_total, bin1_3total)\n", + "print('Total Entropy:',total_entropy1)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NvtXltfRZxk_", + "outputId": "d5fb0599-84c2-4b8b-aa2c-be11f7c3bc4a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Split 2 BIN TABLE:\n", + "25 2 27\n", + "31 2 33\n", + "86 11 97\n", + "Total Entropy: 0.47603328656035293\n" + ] + } + ], + "source": [ + "# Split 2: {1,2},{4,5},{3}\n", + "\n", + "# making bin 1\n", + "bin2_1_0= data.apply(lambda x : True if ((x['M18']==1 or x['M18']==2) and x['S240']==0) else False, axis = 1)\n", + "bin2_1_0_total = len(bin2_1_0[bin2_1_0 == True].index)\n", + "bin2_1_1= data.apply(lambda x : True if ((x['M18']==1 or x['M18']==2) and x['S240']==1) else False, axis = 1)\n", + 
"bin2_1_1_total = len(bin2_1_1[bin2_1_1 == True].index)\n", + "bin2_1total = bin2_1_0_total + bin2_1_1_total\n", + "# calculating entropy of bin 1\n", + "entropy2_1 = entropy(bin2_1_0_total,bin2_1_1_total)\n", + "\n", + "# making bin 2\n", + "bin2_2_0= data.apply(lambda x : True if ((x['M18'] == 4 or x['M18']==5) and x['S240']==0) else False, axis = 1)\n", + "bin2_2_0_total = len(bin2_2_0[bin2_2_0 == True].index)\n", + "bin2_2_1= data.apply(lambda x : True if ((x['M18'] == 2 or x['M18']==4) and x['S240']==1) else False, axis = 1)\n", + "bin2_2_1_total = len(bin2_2_1[bin1_2_1 == True].index)\n", + "bin2_2total = bin1_2_0_total + bin1_2_1_total\n", + "# calculating entropy of bin 2\n", + "entropy2_2 = entropy(bin2_2_0_total,bin2_2_1_total)\n", + "\n", + "# making bin 3\n", + "bin2_3_0= data.apply(lambda x : True if (x['M18'] == 3 and x['S240']==0) else False, axis = 1)\n", + "bin2_3_0_total = len(bin2_3_0[bin2_3_0 == True].index)\n", + "bin2_3_1= data.apply(lambda x : True if (x['M18'] == 3 and x['S240']==1) else False, axis = 1)\n", + "bin2_3_1_total = len(bin1_3_1[bin1_3_1 == True].index)\n", + "bin2_3total = bin2_3_0_total + bin2_3_1_total\n", + "# calculating entropy of bin 3\n", + "entropy2_3 = entropy(bin2_3_0_total,bin2_3_1_total)\n", + "\n", + "# calculating total entropy of split 2\n", + "total = bin2_1total + bin2_2total + bin2_3total\n", + "total_entropy2 = (bin2_1total*entropy2_1 + bin2_2total*entropy2_2 + bin2_3total*entropy2_3)/total\n", + "\n", + "print('Split 2 BIN TABLE:')\n", + "print(bin2_1_0_total, bin2_1_1_total, bin2_1total)\n", + "print(bin1_2_0_total, bin2_2_1_total, bin1_2total)\n", + "print(bin2_3_0_total, bin2_3_1_total, bin2_3total)\n", + "print('Total Entropy:',total_entropy2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kzg4GjWnaMri" + }, + "source": [ + "From the above Shannon Information analysis, it is evident that Split 1 contains more meaningful information." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WSE-3EVFcjT2" + }, + "source": [ + "# Data Visualization" + ] + }, + { + "cell_type": "markdown", + "source": [ + "The menstrual products used are represented by the following numbers:\n", + "\n", + "1. Cloth\n", + "2. Local prep\n", + "3. Sanitary napkin\n", + "4. Tampon\n", + "5. Nothing\n", + "6. Other\n" + ], + "metadata": { + "id": "2xdKWrQpEj5P" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Birth Weights and Sizes" + ], + "metadata": { + "id": "LZPt-AcLE9SH" + } + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "wYoSobNudLit" + }, + "outputs": [], + "source": [ + "plotdata = pd.read_csv('/content/remove_smoke.csv')\n", + "plotdata = plotdata.reset_index(drop=True)\n", + "\n", + "# using data without smoke for birth weight and birth size versus the period product used" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Plot of the product used and the corresponding birth weights in the data" + ], + "metadata": { + "id": "H1byR1BrDCix" + } + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "DE21OHihebSx", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 282 + }, + "outputId": "22983f81-ef56-40d2-aaab-ed0ed2647509" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 19 + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "data1 = plotdata\n", + "df1 = pd.DataFrame(data1)\n", + "\n", + "## creating a values array which will store either 0 or the weight of the child, if the value of S257A is 1\n", + "y_values = []\n", + "for i in range(len(df1['S257A'])):\n", + " y_values.append(df1['S257A'][i] * df1['M19'][i])\n", + "## creating another array of same size\n", + "x_values = range(len(y_values))\n", + "## store all the non zero values in y, because for these non zero values S257A will have value 1\n", + "y_nonzero = [y for y in y_values if y != 0]\n", + "## store 1 in x for all these values \n", + "x_nonzero = [1 for i in range(len(y_values)) if y_values[i] != 0]\n", + "## plot the curve \n", + "sns.stripplot(x=x_nonzero, y=y_nonzero, jitter=0.3)\n", + "\n", + "## same for others\n", + "\n", + "\n", + "\n", + "y_values1 = []\n", + "\n", + "for i in range(len(df1['S257B'])):\n", + " y_values1.append(df1['S257B'][i] * df1['M19'][i])\n", + "x_values1 = range(len(y_values1))\n", + "y_nonzero1 = [y for y in y_values1 if y != 0]\n", + "x_nonzero1 = [2 for i in range(len(y_values1)) if y_values1[i] != 0]\n", + "sns.stripplot(x=x_nonzero1, y=y_nonzero1, jitter=0.3)\n", + "\n", + "\n", + "y_values2 = []\n", + "\n", + "for i in range(len(df1['S257C'])):\n", + " y_values2.append(df1['S257C'][i] * df1['M19'][i])\n", + "x_values2 = range(len(y_values2))\n", + "y_nonzero2 = [y for y in y_values2 if y != 0]\n", + "x_nonzero2 = [3 for i in range(len(y_values2)) if y_values2[i] != 0]\n", + "sns.stripplot(x=x_nonzero2, y=y_nonzero2, jitter=0.3)\n", + "\n", + "\n", + "y_values3 = []\n", + "\n", + "for i in range(len(df1['S257D'])):\n", + " y_values3.append(df1['S257D'][i] * df1['M19'][i])\n", + "x_values3 = range(len(y_values3))\n", + "y_nonzero3 = [y for y in y_values3 if y != 0]\n", + "x_nonzero3 = [4 for i in range(len(y_values3)) if y_values3[i] != 0]\n", + "sns.stripplot(x=x_nonzero3, 
y=y_nonzero3, jitter=0.3)\n", + "\n", + "\n", + "y_values4 = []\n", + "\n", + "for i in range(len(df1['S257E'])):\n", + " y_values4.append(df1['S257E'][i] * df1['M19'][i])\n", + "x_values4 = range(len(y_values4))\n", + "y_nonzero4 = [y for y in y_values4 if y != 0]\n", + "x_nonzero4 = [5 for i in range(len(y_values4)) if y_values4[i] != 0]\n", + "sns.stripplot(x=x_nonzero4, y=y_nonzero4, jitter=0.3)\n", + "\n", + "\n", + "y_values5 = []\n", + "\n", + "for i in range(len(df1['S257X'])):\n", + " y_values5.append(df1['S257X'][i] * df1['M19'][i])\n", + "x_values5 = range(len(y_values5))\n", + "y_nonzero5 = [y for y in y_values5 if y != 0]\n", + "x_nonzero5 = [6 for i in range(len(y_values5)) if y_values5[i] != 0]\n", + "sns.stripplot(x=x_nonzero5, y=y_nonzero5, jitter=0.3)\n", + "\n", + "\n", + "\n", + "\n", + "## product and weight" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Plot of the product used and the corresponding birth size in the data" + ], + "metadata": { + "id": "xScufuL9DKk2" + } + }, + { + "cell_type": "code", + "source": [ + "data = plotdata\n", + "df1 = pd.DataFrame(data)\n", + "\n", + "arr =[]\n", + "\n", + "## overall we have 5 possible values for M18 1,2,3,4,5. 
So first store the count of occurence of each of them\n", + "\n", + "y_values = []\n", + "cnt1=0\n", + "cnt2=0\n", + "cnt3=0\n", + "cnt4=0\n", + "cnt5=0\n", + "for i in range(len(df1['S257A'])):\n", + " y_values.append(df1['S257A'][i] * df1['M18'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + "\n", + "## Store the count for S257A in arr\n", + "arr.append(cnt1);\n", + "arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + "arr.append(cnt5);\n", + "\n", + "## Upto this point, arr will store the frequency of 1,2,3,4,5 in S257A\n", + "\n", + "\n", + "## similarly we will store the frequency for all the columns.\n", + "\n", + "#2nd \n", + "y_values = []\n", + "cnt1=0\n", + "cnt2=0\n", + "cnt3=0\n", + "cnt4=0\n", + "cnt5=0\n", + "for i in range(len(df1['S257B'])):\n", + " y_values.append(df1['S257B'][i] * df1['M18'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + "\n", + "arr.append(cnt1);\n", + "arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + "arr.append(cnt5);\n", + "\n", + "#3rd\n", + "y_values = []\n", + "cnt1=0\n", + "cnt2=0\n", + "cnt3=0\n", + "cnt4=0\n", + "cnt5=0\n", + "for i in range(len(df1['S257C'])):\n", + " y_values.append(df1['S257C'][i] * df1['M18'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + "\n", + "arr.append(cnt1);\n", + 
"arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + "arr.append(cnt5);\n", + "\n", + "#\n", + "y_values = []\n", + "cnt1=0\n", + "cnt2=0\n", + "cnt3=0\n", + "cnt4=0\n", + "cnt5=0\n", + "for i in range(len(df1['S257D'])):\n", + " y_values.append(df1['S257D'][i] * df1['M18'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + "\n", + "arr.append(cnt1);\n", + "arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + "arr.append(cnt5);\n", + "\n", + "#\n", + "y_values = []\n", + "cnt1=0\n", + "cnt2=0\n", + "cnt3=0\n", + "cnt4=0\n", + "cnt5=0\n", + "for i in range(len(df1['S257E'])):\n", + " y_values.append(df1['S257E'][i] * df1['M18'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + "\n", + "arr.append(cnt1);\n", + "arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + "arr.append(cnt5);\n", + "\n", + "#\n", + "y_values = []\n", + "cnt1=0\n", + "cnt2=0\n", + "cnt3=0\n", + "cnt4=0\n", + "cnt5=0\n", + "for i in range(len(df1['S257X'])):\n", + " y_values.append(df1['S257X'][i] * df1['M18'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + "\n", + "arr.append(cnt1);\n", + "arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + "arr.append(cnt5);\n", + "\n", + "## In the end we will have an array of size 30 thats storing the count 
of 1,2,3,4,5 in all the columns \n", + "\n", + "\n", + "## after that its standard procedure to create a bar chart\n", + "arr_t =[]\n", + "arr_s=[]\n", + "for i in range(1,7):\n", + " for j in range(1,6):\n", + " arr_t.append(str(i))\n", + "\n", + "for i in range(1,7):\n", + " for j in range(1,6):\n", + " arr_s.append(str(j))\n", + "\n", + "# arr_s\n", + "data = pd.DataFrame({'type':arr_t,'size':arr_s,'count':arr\n", + " })\n", + "# calculate percentages\n", + "data['percent'] = data['count'] / data.groupby('type')['count'].transform('sum') * 100\n", + "\n", + "# create stacked bar chart\n", + "sns.set_style('whitegrid')\n", + "sns.barplot(x='type', y='percent', hue='size', data=data, palette='magma')\n", + "plt.title('Distribution of size by product used')\n", + "plt.xlabel('Type')\n", + "plt.ylabel('Percentage')\n", + "plt.show()\n", + "\n", + "#size and product" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 295 + }, + "id": "mvfmc3PYmZbv", + "outputId": "0a2abcd5-cca7-4536-d44b-353edb28147a" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Boxplot of the distribution of birth size and product used:" + ], + "metadata": { + "id": "KznONAAeDVQg" + } + }, + { + "cell_type": "code", + "source": [ + "data = plotdata\n", + "df = pd.DataFrame(data)\n", + "\n", + "y_values = []\n", + "for i in range(len(df['S257A'])):\n", + " if df['S257A'][i]==1:\n", + " y_values.append(df['M19'][i])\n", + "\n", + "y_values1 = []\n", + "for i in range(len(df['S257B'])):\n", + " if df['S257B'][i]==1:\n", + " y_values1.append(df['M19'][i])\n", + "\n", + "y_values2 = []\n", + "for i in range(len(df['S257C'])):\n", + " if df['S257C'][i]==1:\n", + " y_values2.append(df['M19'][i])\n", + "\n", + "y_values3 = []\n", + "for i in range(len(df['S257D'])):\n", + " if df['S257D'][i]==1:\n", + " y_values3.append(df['M19'][i])\n", + "\n", + "y_values4 = []\n", + "for i in range(len(df['S257E'])):\n", + " if df['S257E'][i]==1:\n", + " y_values4.append(df['M19'][i])\n", + "\n", + "y_values5 = []\n", + "for i in range(len(df['S257X'])):\n", + " if df['S257X'][i]==1:\n", + " y_values5.append(df['M19'][i]) \n", + "\n", + "NA = [73, 49, 53, 20, 20, 20, 20, 20, 20, 20, 20, 20]\n", + "HG = [73, 30, 60]\n", + "\n", + "df = pd.Series(y_values, name=\"1\").to_frame().join(pd.Series(y_values1, name=\"2\").to_frame().join(pd.Series(y_values2, name=\"3\").to_frame().join(pd.Series(y_values3, name=\"4\").to_frame().join(pd.Series(y_values4, name=\"5\").to_frame().join(pd.Series(y_values5, name=\"6\"))))))\n", + "sns.boxplot(data=df, width = 0.2) \n", + "\n", + "#size and product box plot\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 282 + }, + "id": "ukF-H146mgWm", + "outputId": "d9fd41ea-049d-4662-9671-18943ff17db3" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 22 + }, + { + 
"output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Termination in Pregancy and Complication in Abortion" + ], + "metadata": { + "id": "zXtWgXTuD4l_" + } + }, + { + "cell_type": "code", + "source": [ + "df1 = pd.read_csv('/content/terminated_preg.csv', low_memory=False)\n", + "# using abortion data for the next set of graphs" + ], + "metadata": { + "id": "x6Nx263uneoV" + }, + "execution_count": 25, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Plot of the product used and the percentage distrubution of which month the pregnancy terminated was:" + ], + "metadata": { + "id": "9GO17yC1DnEJ" + } + }, + { + "cell_type": "code", + "source": [ + "data = df1\n", + "df1 = pd.DataFrame(data)\n", + "\n", + "## Similar to the approach used in the bar chart above\n", + "\n", + "arr =[]\n", + "\n", + "y_values = []\n", + "cnt1=0\n", + "cnt2=0\n", + "cnt3=0\n", + "cnt4=0\n", + "cnt5=0\n", + "cnt6=0\n", + "cnt7=0\n", + "cnt8=0\n", + "cnt9=0\n", + "cnt10=0\n", + "cnt11=0\n", + "cnt12=0\n", + "\n", + "for i in range(len(df1['S257A'])):\n", + " y_values.append(df1['S257A'][i] * df1['V229'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + " if(y_values[i]==6):\n", + " cnt6+=1\n", + " if(y_values[i]==7):\n", + " cnt7+=1\n", + " if(y_values[i]==8):\n", + " cnt8+=1\n", + " if(y_values[i]==9):\n", + " cnt9+=1\n", + " if(y_values[i]==10):\n", + " cnt10+=1\n", + " if(y_values[i]==11):\n", + " cnt11+=1\n", + " if(y_values[i]==12):\n", + " cnt12+=1\n", + "arr.append(cnt1);\n", + "arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + "arr.append(cnt5);\n", + "arr.append(cnt6);\n", + "arr.append(cnt7);\n", + "arr.append(cnt8);\n", + "arr.append(cnt9);\n", + 
"arr.append(cnt10);\n", + "arr.append(cnt11);\n", + "arr.append(cnt12);\n", + "#2nd \n", + "\n", + "for i in range(len(df1['S257B'])):\n", + " y_values.append(df1['S257B'][i] * df1['V229'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + " if(y_values[i]==6):\n", + " cnt6+=1\n", + " if(y_values[i]==7):\n", + " cnt7+=1\n", + " if(y_values[i]==8):\n", + " cnt8+=1\n", + " if(y_values[i]==9):\n", + " cnt9+=1\n", + " if(y_values[i]==10):\n", + " cnt10+=1\n", + " if(y_values[i]==11):\n", + " cnt11+=1\n", + " if(y_values[i]==12):\n", + " cnt12+=1\n", + "\n", + "\n", + "arr.append(cnt1);\n", + "arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + "arr.append(cnt5);\n", + "arr.append(cnt6);\n", + "arr.append(cnt7);\n", + "arr.append(cnt8);\n", + "arr.append(cnt9);\n", + "arr.append(cnt10);\n", + "arr.append(cnt11);\n", + "arr.append(cnt12);\n", + "\n", + "#3rd\n", + "y_values = []\n", + "cnt1=0\n", + "cnt2=0\n", + "cnt3=0\n", + "cnt4=0\n", + "cnt5=0\n", + "for i in range(len(df1['S257C'])):\n", + " y_values.append(df1['S257C'][i] * df1['V229'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + " if(y_values[i]==6):\n", + " cnt6+=1\n", + " if(y_values[i]==7):\n", + " cnt7+=1\n", + " if(y_values[i]==8):\n", + " cnt8+=1\n", + " if(y_values[i]==9):\n", + " cnt9+=1\n", + " if(y_values[i]==10):\n", + " cnt10+=1\n", + " if(y_values[i]==11):\n", + " cnt11+=1\n", + " if(y_values[i]==12):\n", + " cnt12+=1\n", + "\n", + "arr.append(cnt1);\n", + "arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + 
"arr.append(cnt5);\n", + "arr.append(cnt6);\n", + "arr.append(cnt7);\n", + "arr.append(cnt8);\n", + "arr.append(cnt9);\n", + "arr.append(cnt10);\n", + "arr.append(cnt11);\n", + "arr.append(cnt12);\n", + "\n", + "#\n", + "y_values = []\n", + "cnt1=0\n", + "cnt2=0\n", + "cnt3=0\n", + "cnt4=0\n", + "cnt5=0\n", + "for i in range(len(df1['S257D'])):\n", + " y_values.append(df1['S257D'][i] * df1['V229'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + " if(y_values[i]==6):\n", + " cnt6+=1\n", + " if(y_values[i]==7):\n", + " cnt7+=1\n", + " if(y_values[i]==8):\n", + " cnt8+=1\n", + " if(y_values[i]==9):\n", + " cnt9+=1\n", + " if(y_values[i]==10):\n", + " cnt10+=1\n", + " if(y_values[i]==11):\n", + " cnt11+=1\n", + " if(y_values[i]==12):\n", + " cnt12+=1\n", + "arr.append(cnt1);\n", + "arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + "arr.append(cnt5);\n", + "arr.append(cnt6);\n", + "arr.append(cnt7);\n", + "arr.append(cnt8);\n", + "arr.append(cnt9);\n", + "arr.append(cnt10);\n", + "arr.append(cnt11);\n", + "arr.append(cnt12);\n", + "\n", + "#\n", + "y_values = []\n", + "cnt1=0\n", + "cnt2=0\n", + "cnt3=0\n", + "cnt4=0\n", + "cnt5=0\n", + "for i in range(len(df1['S257E'])):\n", + " y_values.append(df1['S257E'][i] * df1['V229'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + " if(y_values[i]==6):\n", + " cnt6+=1\n", + " if(y_values[i]==7):\n", + " cnt7+=1\n", + " if(y_values[i]==8):\n", + " cnt8+=1\n", + " if(y_values[i]==9):\n", + " cnt9+=1\n", + " if(y_values[i]==10):\n", + " cnt10+=1\n", + " 
if(y_values[i]==11):\n", + " cnt11+=1\n", + " if(y_values[i]==12):\n", + " cnt12+=1\n", + "arr.append(cnt1);\n", + "arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + "arr.append(cnt5);\n", + "arr.append(cnt6);\n", + "arr.append(cnt7);\n", + "arr.append(cnt8);\n", + "arr.append(cnt9);\n", + "arr.append(cnt10);\n", + "arr.append(cnt11);\n", + "arr.append(cnt12);\n", + "#\n", + "y_values = []\n", + "cnt1=0\n", + "cnt2=0\n", + "cnt3=0\n", + "cnt4=0\n", + "cnt5=0\n", + "for i in range(len(df1['S257X'])):\n", + " y_values.append(df1['S257X'][i] * df1['V229'][i])\n", + "\n", + "for i in range(len(y_values)):\n", + " if(y_values[i]==1):\n", + " cnt1+=1\n", + " if(y_values[i]==2):\n", + " cnt2+=1\n", + " if(y_values[i]==3):\n", + " cnt3+=1\n", + " if(y_values[i]==4):\n", + " cnt4+=1\n", + " if(y_values[i]==5):\n", + " cnt5+=1\n", + " if(y_values[i]==6):\n", + " cnt6+=1\n", + " if(y_values[i]==7):\n", + " cnt7+=1\n", + " if(y_values[i]==8):\n", + " cnt8+=1\n", + " if(y_values[i]==9):\n", + " cnt9+=1\n", + " if(y_values[i]==10):\n", + " cnt10+=1\n", + " if(y_values[i]==11):\n", + " cnt11+=1\n", + " if(y_values[i]==12):\n", + " cnt12+=1\n", + "arr.append(cnt1);\n", + "arr.append(cnt2);\n", + "arr.append(cnt3);\n", + "arr.append(cnt4);\n", + "arr.append(cnt5);\n", + "arr.append(cnt6);\n", + "arr.append(cnt7);\n", + "arr.append(cnt8);\n", + "arr.append(cnt9);\n", + "arr.append(cnt10);\n", + "arr.append(cnt11);\n", + "arr.append(cnt12);\n", + "\n", + "\n", + "arr_t =[]\n", + "arr_s=[]\n", + "for i in range(1,7):\n", + " for j in range(1,13):\n", + " arr_t.append(str(i))\n", + "\n", + "for i in range(1,7):\n", + " for j in range(1,13):\n", + " arr_s.append(str(j))\n", + "\n", + "# len(arr)\n", + "# arr\n", + "\n", + "data = pd.DataFrame({'type':arr_t,'month':arr_s,'count':arr\n", + " })\n", + "# calculate percentages\n", + "data['percent'] = data['count'] / data.groupby('type')['count'].transform('sum') * 100\n", + "\n", + "# create stacked bar chart\n", + 
"sns.set_style('whitegrid')\n", + "sns.barplot(x='type', y='percent', hue='month', data=data, palette='magma')\n", + "plt.title('Distribution of month by product used')\n", + "plt.xlabel('Type')\n", + "plt.ylabel('Percentage')\n", + "plt.show()\n", + "\n", + "#prod and preg terminated\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 295 + }, + "id": "xUTij5MTqjno", + "outputId": "81c7a021-4c71-4fa2-fcb0-ad17bb604761" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
# Plot of the product used and distribution of whether or not there was a
# complication in pregnancy.

# Abortion data used for the next set of graphs.
df2 = pd.read_csv('/content/abortion_complication.csv')

# Indicator columns, one per menstrual product; a value of 1 marks the row as
# using that product. Their position here (1..6) is the 'type' shown on the plot.
# NOTE(review): the human-readable product names behind these codes are not
# visible in this notebook - confirm against the NFHS codebook before relabelling.
PRODUCT_COLUMNS = ('S257A', 'S257B', 'S257C', 'S257D', 'S257E', 'S257X')
# Column compared against 1 to split each product's users into two groups.
COMPLICATION_COLUMN = 'S240'


def count_complications_by_product(frame,
                                   product_columns=PRODUCT_COLUMNS,
                                   complication_column=COMPLICATION_COLUMN):
    """Tabulate, per product column, how many users fall in each `comp` group.

    For every column in ``product_columns`` the rows where that column equals 1
    are split in two:

    * ``comp == '1'``: ``complication_column`` equals 1
    * ``comp == '2'``: ``complication_column`` is anything else

    NOTE(review): "anything else" includes missing values, so rows with an
    unknown ``complication_column`` are counted in group '2'. This mirrors the
    original row-by-row loop; confirm that it is the intended treatment.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain every column named in ``product_columns`` and
        ``complication_column``.
    product_columns : sequence of str
        Product indicator columns, in the order they should be numbered.
    complication_column : str
        Column compared against 1 to form the two groups.

    Returns
    -------
    pandas.DataFrame
        Two rows per product with the columns ``type`` (1-based position of
        the product column, as a string), ``comp`` ('1' or '2'), ``count`` and
        ``percent`` (share of that product's users, 0-100; NaN when a product
        has no users at all).

    Raises
    ------
    KeyError
        If one of the requested columns is missing from ``frame``.
    """
    in_first_group = frame[complication_column] == 1

    records = []
    for type_id, column in enumerate(product_columns, start=1):
        uses_product = frame[column] == 1
        records.append({
            'type': str(type_id),
            'comp': '1',
            'count': int((uses_product & in_first_group).sum()),
        })
        records.append({
            'type': str(type_id),
            'comp': '2',
            'count': int((uses_product & ~in_first_group).sum()),
        })

    summary = pd.DataFrame(records, columns=['type', 'comp', 'count'])
    # Normalise within each product so the two bars of a product sum to 100.
    summary['percent'] = (
        summary['count'] / summary.groupby('type')['count'].transform('sum') * 100
    )
    return summary


# A dedicated name is used instead of rebinding `df1` / `data`, so that earlier
# cells which read `df1` still see their own dataset when they are re-run.
complication_by_product = count_complications_by_product(df2)

# Grouped bar chart: one pair of bars (comp '1' / comp '2') per product type.
sns.set_style('whitegrid')
sns.barplot(x='type', y='percent', hue='comp',
            data=complication_by_product, palette='magma')
plt.title('Distribution of abortion complication by product used')
plt.xlabel('Type')
plt.ylabel('Percentage')
plt.show()
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file