import pandas as pd
# Build a one-dimensional pandas Series from a plain Python list.
data = [2, 4, 3, 4, 4]
index = range(5)
# No explicit index is passed here; `index=index` could be supplied as an
# additional keyword argument.
s = pd.Series(data=data)
s
A DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.
See http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Use the 'ggplot' style sheet for the matplotlib figures that follow.
plt.style.use('ggplot')
# Adapted from
# http://www.analyticsvidhya.com/blog/2014/08/baby-steps-python-performing-exploratory-analysis-python/
# Read the Titanic training data into a pandas DataFrame.
df = pd.read_csv("./titanic-train.csv")
# Show the first two rows (type(df) would confirm that this is a DataFrame).
df.head(n=2)
# Generate various summary statistics.
df.describe()
# Histogram of the 'Age' column using 20 bins.
df['Age'].hist(bins=20)
Age has (891 – 714 =) 177 missing values.
We can also see that about 38% of the passengers survived the tragedy. How? The mean of the Survived field is 0.38 (remember, Survived has the value 1 for those who survived and 0 otherwise).
By looking at the percentiles of Pclass, you can see that more than 50% of the passengers belong to class 3.
The age distribution seems to be in line with expectation. Same with SibSp and Parch
The fare seems to have values with 0 indicating possibility of some free tickets or data errors. On the other extreme, 512 looks like a possible outlier / error
Three kinds of measures are needed to understand a distribution:
# Measures of central tendency.
# print is called with a single pre-formatted string so that the lines run
# unchanged on Python 2 (print statement) and Python 3 (print function).

# Mean - the arithmetic average.
print("Mean of Age: {}".format(df['Age'].mean()))
# Median - the value which divides the population into two halves.
print("Median of Age: {}".format(df['Age'].median()))
# Mode - the most frequent value in a population.
# Series.mode() returns a Series, because several values can tie.
print("Mode of Pclass: {}".format(df['Pclass'].mode()))
# The minimum can be obtained the same way, e.g. df.Pclass.min()
For skewness and kurtosis, see http://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm
# Skewness and kurtosis of the age distribution.
# A single pre-formatted string keeps the calls valid on both Python 2 and
# Python 3; the spacing after each colon reproduces the original output.
print("Skew of age:  {}".format(df.Age.skew()))
print("Kurtosis of age: {}".format(df.Age.kurtosis()))
# Return the first three rows.
df.head(n=3)
# describe() does not list a statistic named 'median' (its 50% row is the
# median); it can be computed directly on a column.
df.Age.median()
# Distinct values occurring in the 'Sex' column.
df.Sex.unique()
# Histogram of the passenger ages, drawn with matplotlib directly.
# 'Age' contains missing values; drop them explicitly so the bin range and
# the counts do not depend on how matplotlib/numpy treat NaN.
ages = df['Age'].dropna()
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
ax.hist(ages, bins=15, range=(ages.min(), ages.max()))
# Alternative: the hist() method of a pandas Series,
# e.g. df['Age'].hist(bins=10)
ax.set_title('Age distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Count of Passengers')
plt.show()
## Histogram of the ticket fares.
fig = plt.figure()
ax = fig.add_subplot(111)
# The upper end of the histogram range is fixed at 50 instead of
# df['Fare'].max().
fare_range = (df['Fare'].min(), 50.)
ax.hist(df['Fare'], bins=10, range=fare_range)
ax.set_title('Fare distribution')
ax.set_xlabel('Fare')
ax.set_ylabel('Count of Passengers')
plt.show()
# Box plot of the fare, one box per passenger class.
df.boxplot(column='Fare', by='Pclass')
import seaborn as sns
# x, y and data are passed by keyword: recent seaborn releases no longer
# accept them positionally, while older releases accept the keywords too.
# Scatter plot of Fare against Age with a regression fit and marginal plots.
sns.jointplot(x="Age", y="Fare", data=df, kind='reg')
# The same regression, drawn in one column of plots per passenger class.
sns.lmplot(x="Age", y="Fare", data=df, col="Pclass")
## Violin plot
# Keep only the rows where the age is known.
df_nn = df[df['Age'].notnull()]
import seaborn as sns
# Age on x, Sex on y. The variables are passed by keyword because recent
# seaborn releases reject positional x/y arguments.
# cut=0. keeps the density estimate within the range of the observed data.
sns.violinplot(x='Age', y='Sex', data=df_nn, cut=0.)
# Remove the top and right axes spines.
sns.despine()
# Pie chart: share of passengers in each passenger class.
fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot(111)
ax.axis("equal")  # equal axis scaling, so the pie is drawn as a circle
plt.title("Pclass distribution")
pgroup = df.groupby(['Pclass'])
# Number of passengers per class (count of PassengerId values).
pcouts = pgroup.PassengerId.count()
pcouts.name = "Number of Passengers per Class"
# The plot kind must be lowercase: current pandas rejects 'Pie' as an
# invalid plot kind, while 'pie' is accepted by old and new versions.
pcouts.plot(kind='pie', autopct="%1.1f%%", ax=ax)
# Passenger count and survival rate per class, as two bar charts side by side.
survived_by_class = df.groupby('Pclass').Survived
# Number of passengers per class
# (equivalently: df['Survived'].groupby(df['Pclass']).count()).
temp1 = survived_by_class.count()
# Fraction of survivors per class: survivors divided by passengers.
temp2 = survived_by_class.sum() / survived_by_class.count()

fig = plt.figure(figsize=(8, 4))

# Left panel: passenger counts.
ax1 = fig.add_subplot(121)
ax1.set_xlabel('Pclass')
ax1.set_ylabel('Count of Passengers')
ax1.set_title("Passengers by Pclass")
temp1.plot(kind='bar', ax=ax1)

# Right panel: survival rate.
ax2 = fig.add_subplot(122)
temp2.plot(kind='bar', ax=ax2)
ax2.set_xlabel('Pclass')
ax2.set_ylabel('Probability of Survival')
ax2.set_title("Probability of survival by class")
# Binning can be done with pandas directly; pd.cut yields categorical bins.
# Split the ages into 8 bins, with bin labels shown at precision 0.
age_bins = pd.cut(df['Age'], bins=8, precision=0)
groups = df.groupby(age_bins)
# Number of passengers in each age bin.
groups['PassengerId'].count().plot(kind='bar')
# Mean of 'Survived' (i.e. the survival rate) in each age bin.
temp = groups['Survived'].mean()
temp.plot(kind='bar')
# Bin the ages into four named groups with hand-picked edges.
group_names = ['kids', 'youth', 'adults', 'seniors']
age_edges = [df.Age.min(), 14, 22, 55, df.Age.max()]
# pd.cut builds right-closed intervals, so without include_lowest=True the
# first bin would be (min, 14] and the youngest passenger(s) - whose age
# equals the left edge - would fall outside every bin and become NaN.
age_bins = pd.cut(df.Age, age_edges, labels=group_names, include_lowest=True)
groups = df.groupby(age_bins)
# Survival rate per age group.
temp = groups.Survived.mean()
temp.plot(kind='bar')
# To discretize a variable into equal-sized buckets based on rank or on sample quantiles, see pd.qcut
#pd.qcut?
## binning with numpy
#bins = np.round(np.linspace(0., df.Age.max(), 10))
#age_bins = pd.Series(np.digitize(df.Age, bins))
##the last bin is here nan
#age_bins[age_bins == age_bins.max()] = 'NaN'
#for i in range(len(bins)-1):
# age_bins[age_bins == i+1] = "{0:2.0f}-{1:2.0f}".format(bins[i], bins[i+1])
#groups = df.groupby(age_bins)
#temp = groups.Survived.mean()
#temp.plot(kind='bar')
# Survival rate for every combination of age bin and sex.
data = df.groupby([age_bins, 'Sex'])['Survived'].mean()
# Note: data has a hierarchical index (age bin first, then sex).
data
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_xlabel('Age')
ax.set_ylabel('Probability of Survival')
ax.set_title("Probability of survival by age and sex")
# Move the second index level ('Sex') into columns, so that every age bin
# gets one bar per sex in a single shared plot.
survival_table = data.unstack(level=1)
survival_table.plot(kind='bar', subplots=False, ax=ax)
# Number of passengers for every (Sex, Survived) combination.
var = df.groupby(['Sex', 'Survived'])['PassengerId'].count()
# Turn the 'Survived' level into columns and draw one stacked bar per sex;
# the first column is drawn in red, the second in blue.
counts_by_outcome = var.unstack()
ax = counts_by_outcome.plot(kind='bar', stacked=True,
                            color=['red', 'blue'], grid=False)
ax.set_xlabel('Sex')
ax.set_ylabel('Number of Passengers')
from statsmodels.graphics.mosaicplot import mosaic
# Mosaic plot of survival by sex and passenger class.
_ = mosaic(df, ['Survived', 'Sex', 'Pclass'])


def _age_group(age):
    """Classify an age as 'adult' (> 14), 'child', or 'unknown' if missing."""
    # A missing age is NaN, and `NaN > 14.` is False; without this check
    # every passenger with an unknown age would be counted as a child.
    if pd.isnull(age):
        return "unknown"
    return "adult" if age > 14. else "child"


# Add an age-group column and include it in a second mosaic plot.
df['Adult'] = df["Age"].apply(_age_group)
_ = mosaic(df, ['Survived', 'Sex', 'Pclass', 'Adult'])
# Probability of survival for a woman travelling in 3rd class.
is_female = df['Sex'] == 'female'
in_third_class = df['Pclass'] == 3
df.loc[is_female & in_third_class, 'Survived'].mean()
# For more plots with pandas, see http://pandas.pydata.org/pandas-docs/stable/visualization.html
# Simple string handling to extract the salutation from the name: the text
# between the first ',' and the following '.', stripped of whitespace
# (assumes names look like "Surname, Title. Given names").
salutation = df.Name.apply(lambda w: w.split(',')[1].split('.')[0].strip())
# Attribute assignment (df.Salutation = ...) does not create a new column,
# so the column is created with item assignment.
df['Salutation'] = salutation
df.groupby('Salutation').PassengerId.count()
# Use 'Other' for the low-count salutations.
# A single .loc assignment is used instead of the chained form
# df.Salutation[mask] = ..., which may write to a temporary copy and leave
# df unchanged (and triggers SettingWithCopyWarning).
common_salutations = ['Mr', 'Mrs', 'Miss', 'Master']
df.loc[~df['Salutation'].isin(common_salutations), 'Salutation'] = 'Other'
df.boxplot(column='Age', by='Salutation')
################
# Replace missing ages with the median age, using pandas
# (alternative: the scikit-learn imputer).
# This is done only after the exploratory data analysis.
medianAge = df['Age'].median()
df['Age'] = df['Age'].fillna(value=medianAge)
##################