Skip to content

Commit

Permalink
Add titanic dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
mwaskom committed Feb 24, 2014
1 parent dbf9774 commit 2efd4a5
Show file tree
Hide file tree
Showing 3 changed files with 1,814 additions and 0 deletions.
30 changes: 30 additions & 0 deletions process/titanic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import numpy as np
import pandas as pd

def main():
    """Build the processed titanic dataset from the raw CSV.

    Reads ``raw/titanic.csv``, adds the derived columns ``class``, ``who``,
    ``adult_male``, ``deck``, ``embark_town``, ``alive`` and ``alone``,
    drops the ``name``, ``ticket`` and ``cabin`` columns, and writes the
    result to ``titanic.csv``. Both paths are relative to the current
    working directory.
    """
    raw_data = "raw/titanic.csv"
    df = pd.read_csv(raw_data)

    df["class"] = df.pclass.map({1: "First", 2: "Second", 3: "Third"})
    df["who"] = df[["age", "sex"]].apply(woman_child_or_man, axis=1)
    df["adult_male"] = df.who == "man"
    # Deck is the first character of the cabin code; "T" is mapped to missing.
    df["deck"] = df.cabin.str[0].map(lambda s: np.nan if s == "T" else s)
    df["embark_town"] = df.embarked.map(
        {"C": "Cherbourg", "Q": "Queenstown", "S": "Southampton"}
    )
    df["alive"] = df.survived.map({0: "no", 1: "yes"})
    # True exactly when parch + sibsp is zero.
    df["alone"] = ~(df.parch + df.sibsp).astype(bool)
    df = df.drop(["name", "ticket", "cabin"], axis=1)

    # The row index is only read_csv's default positional index; writing it
    # would add an unnamed leading column that comes back as "Unnamed: 0"
    # whenever the output file is read again.
    df.to_csv("titanic.csv", index=False)


def woman_child_or_man(passenger):
    """Label a passenger as "child", "man" or "woman".

    ``passenger`` is a two-item iterable holding ``(age, sex)`` in that
    order. Anyone with ``age < 16`` is a "child"; everyone else is labelled
    from ``sex``, which must then be "male" or "female" (any other value
    raises ``KeyError``).
    """
    years, gender = passenger
    adult_label = {"male": "man", "female": "woman"}
    # A NaN age compares False against 16, so such rows are labelled by sex.
    return "child" if years < 16 else adult_label[gender]


# Run the dataset build only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 2efd4a5

Please sign in to comment.