import math
import operator
import matplotlib.pyplot as plt
import pandas as pan
from pandas.tools.plotting import scatter_matrix
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn import neighbors, tree, naive_bayes
from sklearn.ensemble import RandomForestClassifier as RFC
from collections import OrderedDict
from sklearn.ensemble import AdaBoostClassifier
from sklearn import cross_validation
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn import decomposition
from IPython.display import Image
def GetData(fileN='train.csv', test_size=0.2):
    ''' loading the shelter data with pandas '''
    # reading file
    datapan = pan.read_csv(fileN, header=0, na_values=['', 'NA', 'Nan'])
    colnames = datapan.columns
    # datapan.describe(include='all')
    # converting age to days; iterrows() yields copies, so write back with .at
    for idx, row in datapan.iterrows():
        data = str(row['AgeuponOutcome']).strip().split(' ')
        if len(data) == 1:
            data = None  # missing value ('nan'), filled with the mean below
        elif 'year' in data[1]:
            data = int(data[0]) * 365
        elif 'month' in data[1]:
            data = round(int(data[0]) * 30.4, 0)
        elif 'week' in data[1]:
            data = int(data[0]) * 7
        else:  # the remaining unit is plain days
            data = int(data[0])
        datapan.at[idx, 'AgeuponOutcome'] = data
    # datapan['AgeuponOutcome'].unique()
    # missing values in AgeuponOutcome get the mean age (in days)
    datapan['AgeuponOutcome'] = datapan['AgeuponOutcome'].astype(float)
    days = int(datapan['AgeuponOutcome'].mean())
    datapan.AgeuponOutcome.fillna(days, axis=0, inplace=True)
    colToKeep = ['AnimalID', 'OutcomeType', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color']
    X = datapan[colToKeep].copy()
    # X.describe(include='all')
    # removing the one row with a missing value
    X.dropna(axis=0, inplace=True)
    # X.describe(include='all')
    names = ['AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color']
    Xnumeric = pan.get_dummies(X[names])
    Xnumeric['AnimalID'] = X['AnimalID']
    # Selecting pets for the training and testing datasets
    # unique pets
    pets = X.groupby(['AnimalID']).AnimalID.nunique()
    pets = pan.DataFrame(pets)
    pets['AnimalID'] = pets.index
    # placeholder feature array; the split is only used to partition the pet IDs
    smp = np.zeros(len(pets))
    Xtrain, Xtest, Ptrain, Ptest = train_test_split(smp, pets, test_size=test_size, random_state=33)
    Ptrain = pan.DataFrame(Ptrain)
    Ptest = pan.DataFrame(Ptest)
    Ptrain['AnimalID'] = Ptrain.index
    Ptest['AnimalID'] = Ptest.index
    # Ptrain holds the pet IDs for the training set
    # Ptest holds the pet IDs for the testing set
    # creating training and testing sets
    Xtrain = Xnumeric[Xnumeric.AnimalID.isin(Ptrain.AnimalID)].copy()
    Xtest = Xnumeric[Xnumeric.AnimalID.isin(Ptest.AnimalID)].copy()
    # len(Xtrain) + len(Xtest) == len(datapan) - 1
    trainPets = Xtrain.groupby(['AnimalID']).AnimalID.nunique()
    testPets = Xtest.groupby(['AnimalID']).AnimalID.nunique()
    print 'Number of pets in training set %i, for a total of %i instances' % (len(trainPets), len(Xtrain))
    print 'Number of pets in testing set %i, for a total of %i instances \n' % (len(testPets), len(Xtest))
    # Getting the actual outcome
    cols = ['AnimalID', 'OutcomeType']
    outcome = datapan[cols]
    # creating training and testing targets
    ytrain = outcome[outcome.AnimalID.isin(Ptrain.AnimalID)]
    ytest = outcome[outcome.AnimalID.isin(Ptest.AnimalID)]
    # Check that every pet landed in either the training or the testing set
    assert len(ytrain) + len(ytest) == len(pets)
    # Note: dummies were built on the full dataset above so that Xtrain and Xtest
    # share the same columns; calling get_dummies per split could misalign categories.
    Xtrain.pop('AnimalID')
    Xtest.pop('AnimalID')
    print 'Done\n'
    return X, Xtrain, ytrain, Xtest, ytest
X, Xtrain, ytrain, Xtest, ytest = GetData()
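''' The row loop inside GetData works, but the same conversion can be done with a
vectorized apply. A minimal sketch under the same assumptions about the
"<number> <unit>" format of AgeuponOutcome (age_to_days is an illustrative helper,
not part of the original pipeline): '''
def age_to_days(value):
    parts = str(value).strip().split(' ')
    if len(parts) < 2:
        return np.nan  # missing value; filled with the mean later
    number = int(parts[0])
    if 'year' in parts[1]:
        return number * 365
    if 'month' in parts[1]:
        return round(number * 30.4, 0)
    if 'week' in parts[1]:
        return number * 7
    return number  # the remaining unit is plain days
# usage sketch:
# datapan['AgeuponOutcome'] = datapan['AgeuponOutcome'].apply(age_to_days)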
%matplotlib inline
X['OutcomeType'].value_counts().plot(kind='bar', color = 'green', title = 'Distribution of OutcomeType',figsize=(10,6))
''' Adoption is the most frequent outcome, followed by transfer. Perhaps the shelter
does not handle some breeds, or is at full capacity? Note that few animals died of
natural causes; more deaths come from human decisions (euthanasia). '''
%matplotlib inline
X['AnimalType'].value_counts().plot(kind='bar', color = 'green', title = 'Distribution of Animal type',figsize=(10,6))
''' Most of the animals in the shelter are dogs '''
%matplotlib inline
X['SexuponOutcome'].value_counts().plot(kind='bar', color = 'green', title = 'Distribution of Sex upon outcome',figsize=(10,6))
''' Animals that are neutered or spayed have a greater chance of being adopted. Why? Intact
animals are moved to other shelters; it could be that other shelters want intact animals.
Neutered animals are also transferred, possibly as part of rotating programs?
We do not know whether foster families count as a transfer; it might be the case, since
fostering is not one of the outcome options. '''
outcome_sex = pan.crosstab(X['OutcomeType'], X['SexuponOutcome'])
%matplotlib inline
outcome_sex.plot(kind='bar', title='Outcome type by sex', figsize=(10, 6))
plt.show()
outcome_animal = pan.crosstab(X['AnimalType'], X['OutcomeType'])
%matplotlib inline
outcome_animal.plot(kind='bar', title='Outcome type by animal type', figsize=(20, 6))
plt.show()
animal_Sex = pan.crosstab(X['SexuponOutcome'], X['AnimalType'])
%matplotlib inline
animal_Sex.plot(kind='bar', title='Animal type by sex', figsize=(20, 6))
plt.show()
print '\tAge at outcome in days \n', X['AgeuponOutcome'].describe(), '\n'
outcome_age = pan.crosstab(X['AgeuponOutcome'], X['OutcomeType'])
%matplotlib inline
outcome_age.plot(kind='bar', title='Age to Outcome (days)', figsize=(20, 10))
plt.show()
The average age of a pet when it leaves the shelter is two years and two months. Ideally we would want to see a normal distribution, which would mean that every outcome happens for any animal regardless of its age. The "Age to Outcome" graph shows that animals around 60 days old have a high probability of being adopted: people look for puppies and kittens to raise. When animals are between one and two years old there are three good outcomes for the Austin shelter: they are adopted, transferred, or returned to their owners. One interesting aspect of the data is that animals that get lost or escape from home and are returned to their owners are mostly 3 years or older. Adoptions and transfers to other shelters happen at any age; these are the two most common outcomes during the first year of life for any type of pet. Pets in bad condition that are between one and two years old have a high probability of dying. Adoption, the best possible outcome for an animal, peaks at the second month of life and again around one year, and the trend declines as the animal grows older. Euthanasia peaks at two years of age, followed by one-year-old animals. At day 30 of life many animals share the same outcome.
animal_age = pan.crosstab(X['AgeuponOutcome'], X['AnimalType'])
%matplotlib inline
animal_age.plot(kind='bar', title='Animal type by age', figsize=(20, 6))
plt.show()
Cats are more likely to be adopted when they are 6 months old. From the outcome-to-age analysis we learned that adoption peaks at day 60. The "Animal type by age" graph adds that adoption is more likely for cats, even though there are fewer of them. From the exploratory analysis we found that the animal most likely to be adopted at 2 months is a neutered male cat, followed by a spayed female cat. On the other hand, intact male dogs are the most likely to be euthanized, followed by intact female dogs. At day 30 many animals are euthanized, most likely intact male cats.
age_sex = pan.crosstab(X['AgeuponOutcome'], X['SexuponOutcome'])
%matplotlib inline
age_sex.plot(kind='bar', title='Age by sex', figsize=(20, 6))
plt.show()
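''' The raw crosstabs above are dominated by the biggest classes; normalizing each row
to proportions makes the comparisons easier to read. A small sketch reusing the same
crosstab and plot calls (the manual normalization is an assumption, kept explicit for
compatibility with older pandas): '''
outcome_sex_pct = pan.crosstab(X['SexuponOutcome'], X['OutcomeType']).apply(lambda r: r.astype(float) / r.sum(), axis=1)
outcome_sex_pct.plot(kind='bar', stacked=True, title='Outcome proportions by sex', figsize=(12, 6))
plt.show()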
''' Testing the maximum number of features considered at each split '''
random_state = 33
acc_max_features = []
for i in range(10, 1000, 10):
    treeclf = tree.DecisionTreeClassifier(criterion='gini', max_features=i, max_leaf_nodes=12,
                                          min_samples_split=100,
                                          max_depth=int(round(len(Xtrain)**.5)),
                                          random_state=random_state)
    treeclf = treeclf.fit(Xtrain, ytrain['OutcomeType'])
    acc_max_features.append(treeclf.score(Xtest, ytest['OutcomeType']))
feature_counts = [z for z in range(10, 1000, 10)]
%matplotlib inline
plt.plot(feature_counts, acc_max_features)
plt.xlabel("number of features considered at each split")
plt.ylabel("Accuracy rate")
plt.title('Accuracy by number of features')
plt.show()
'''Best accuracy is around 400 features per split. The final model is as follows:'''
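''' The sweep above tunes one hyperparameter at a time. As an alternative, scikit-learn
can search several jointly; a minimal sketch assuming the era-appropriate
sklearn.grid_search module (the grid values below are illustrative, not tuned): '''
from sklearn.grid_search import GridSearchCV
param_grid = {'max_features': [200, 300, 400, 500],
              'min_samples_split': [100, 300, 500]}
grid = GridSearchCV(tree.DecisionTreeClassifier(criterion='gini', max_leaf_nodes=12,
                                                random_state=random_state),
                    param_grid, cv=5)
grid.fit(Xtrain, ytrain['OutcomeType'])
print 'Best parameters:', grid.best_params_
print 'Best CV accuracy: %0.2f' % grid.best_score_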
treeclf = tree.DecisionTreeClassifier(criterion='gini', max_features=400, max_leaf_nodes=12,
                                      min_samples_split=500,
                                      max_depth=int(round(len(Xtrain)**.5)),
                                      random_state=random_state)
treeclf = treeclf.fit(Xtrain, ytrain['OutcomeType'])
treeTest = treeclf.predict(Xtrain)  # in-sample predictions for the report below
print classification_report(ytrain['OutcomeType'], treeTest)
print 'Confusion Matrix'
print confusion_matrix(ytrain['OutcomeType'], treeTest)
print 'Accuracy ',treeclf.score(Xtest, ytest['OutcomeType'])*100
treeclf = tree.DecisionTreeClassifier(criterion='gini', max_features=400, max_leaf_nodes=12,
                                      min_samples_split=500,
                                      max_depth=int(round(len(Xtest)**.5)),
                                      random_state=random_state)
tree_scores = cross_validation.cross_val_score(treeclf, Xtest, ytest['OutcomeType'], cv=10)
print("Decision trees overall Accuracy: %0.2f (+/- %0.2f)" % (tree_scores.mean(), tree_scores.std() * 2))
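''' cv=10 above lets cross_val_score build the folds internally. A sketch of the
equivalent explicit construction with the KFold imported at the top (old
sklearn.cross_validation signature assumed): '''
kf = KFold(len(ytest), n_folds=10, shuffle=True, random_state=random_state)
tree_scores_kf = cross_validation.cross_val_score(treeclf, Xtest, ytest['OutcomeType'], cv=kf)
print("Decision trees (explicit KFold) Accuracy: %0.2f (+/- %0.2f)" % (tree_scores_kf.mean(), tree_scores_kf.std() * 2))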
''' This model has the best accuracy; however, it predicts poorly for animals that
eventually die or are euthanized. The best metrics are for Adoption and Transfer,
which are the most common outcomes. Implementing AdaBoost for the hard cases could
be a good approach, since it puts more weight on them; a quick sketch follows. '''
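''' A quick sketch of that AdaBoost idea (the hyperparameters below are illustrative
assumptions, not tuned values): boosting re-weights the training instances each round,
so hard outcomes such as Died and Euthanasia get progressively more attention. '''
ada = AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=3),
                         n_estimators=50, random_state=random_state)
ada_scores = cross_validation.cross_val_score(ada, Xtest, ytest['OutcomeType'], cv=10)
print("AdaBoost overall Accuracy: %0.2f (+/- %0.2f)" % (ada_scores.mean(), ada_scores.std() * 2))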
lda = LDA()
lda = lda.fit(Xtrain, ytrain['OutcomeType'])
lda_scores = cross_validation.cross_val_score(lda, Xtest, ytest['OutcomeType'], cv=10)
print("LDA overall Accuracy: %0.2f (+/- %0.2f)" % (lda_scores.mean(), lda_scores.std() * 2))
nbclf = naive_bayes.GaussianNB()
nb_scores = cross_validation.cross_val_score(nbclf, Xtest, ytest['OutcomeType'], cv=10)
print("Naive Bayes overall Accuracy: %0.2f (+/- %0.2f)" % (nb_scores.mean(), nb_scores.std() * 2))
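''' GaussianNB assumes continuous features, but nearly every column here is a 0/1 dummy;
BernoulliNB is designed for binary features and may fit better. A hedged alternative
sketch (note that BernoulliNB's default binarize=0.0 flattens AgeuponOutcome to 1,
losing its scale): '''
bnbclf = naive_bayes.BernoulliNB()
bnb_scores = cross_validation.cross_val_score(bnbclf, Xtest, ytest['OutcomeType'], cv=10)
print("Bernoulli Naive Bayes overall Accuracy: %0.2f (+/- %0.2f)" % (bnb_scores.mean(), bnb_scores.std() * 2))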
random_state=33
Params = [
    ("RandomForestClassifier, max_features='sqrt'",
     RFC(warm_start=True, oob_score=True, max_features='sqrt', criterion='gini', random_state=random_state)),
    ("RandomForestClassifier, max_features='log2'",
     RFC(warm_start=True, oob_score=True, max_features='log2', criterion='gini', random_state=random_state)),
    ("RandomForestClassifier, max_features=None",
     RFC(warm_start=True, oob_score=True, max_features=None, criterion='gini', random_state=random_state))]
# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in Params)
# Range of `n_estimators` values to explore.
min_estimators = int(len(Xtrain.columns)**(1/2.))
max_estimators = int(len(Xtrain.columns)**(1/2.)*1.7)
for label, clf in Params:
    for i in range(min_estimators, max_estimators + 1):
        clf.set_params(n_estimators=i)
        clf.fit(Xtrain, ytrain['OutcomeType'])
        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = 1 - clf.oob_score_
        error_rate[label].append((i, oob_error))
# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)
plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()
Image('RFC OOB.png')
# `ys` holds OOB error rates (for the last configuration plotted), so accuracy = 1 - error
print("Random Forest overall Accuracy: %0.2f (+/- %0.2f)" % (1 - np.mean(ys), np.std(ys) * 2))
print 'Best Random Forest Model Accuracy: %0.2f ' % (1 - min(ys))
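''' A short follow-up sketch: instead of reading the best point off the plot, pick the
n_estimators with the lowest recorded OOB error for each configuration. '''
for label, clf_err in error_rate.items():
    best_n, best_err = min(clf_err, key=lambda pair: pair[1])
    print '%s: best n_estimators = %i, OOB accuracy = %0.2f' % (label, best_n, 1 - best_err)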