Documented Python Code

Loading packages

In [100]:
import math
import operator
import matplotlib.pyplot as plt
import pandas as pan
from pandas.tools.plotting import scatter_matrix
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn import neighbors, tree, naive_bayes 
from sklearn.ensemble import RandomForestClassifier as RFC
from collections import OrderedDict
from sklearn.ensemble import AdaBoostClassifier
from sklearn import cross_validation
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn import decomposition
from IPython.display import Image

Data preprocessing

In [17]:
def GetData(fileN ='train.csv', test_size =0.2):
    ''' loading with pandas'''
    # reading file
    datapan = pan.read_csv(fileN,header=0, na_values=['','NA','Nan'])   
    colnames = datapan.columns    
    datapan.describe(include='all')
    
    # converting age to days
    for i in datapan.iterrows():
        data  = str(i[1]['AgeuponOutcome'])
        data = data.strip().split(' ')
        if len(data)==1:
            data = None
        elif 'year' in data[1]:
            data = int(data[0])*365
        elif 'month' in data[1]:
            data = round(int(data[0])*30.4,0)
        elif 'week' in data[1]:
            data = int(data[0])*7
        else:
            data = int(data[0])*7
        i[1]['AgeuponOutcome'] = data
  
   
    # datapan['AgeuponOutcome'].unique()
    # missing values in AgeuponOutcome to mean
    days= int(datapan['AgeuponOutcome'].mean())
    datapan.AgeuponOutcome.fillna(days, axis=0, inplace=True)
    colToKeep = ['AnimalID','OutcomeType','AnimalType',	'SexuponOutcome',	'AgeuponOutcome',	'Breed',	'Color']
    X = datapan[colToKeep]
    
    #X.describe(include='all')
    # removing 1 missing value
    X.dropna(axis=0,inplace=True)
    #X.describe(include='all')

    
    names = ['AnimalType','SexuponOutcome','AgeuponOutcome',	'Breed',	'Color']
    Xnumeric = pan.get_dummies(X[names])
    Xnumeric['AnimalID']=X['AnimalID']
    
    # Selecting pets for the training and testing datasets
    # unique pets
    pets = X.groupby(['AnimalID']).AnimalID.nunique()
    pets = pan.DataFrame(pets)
    pets['AnimalID'] = pets.index
    # temporary data
    smp = np.zeros(len(pets))
    Xtrain, Xtest, Ptrain, Ptest = train_test_split(smp, pets, test_size=test_size, random_state=33)
    Ptrain = pan.DataFrame(Ptrain)
    Ptest = pan.DataFrame(Ptest)
    Ptrain['AnimalID'] = Ptrain.index
    Ptest['AnimalID'] = Ptest.index
    # Ptrain has the pet's Ids for the training set len(Ptrain)
    # Ptest has the pet's Ids for the training set len(Ptest)
    
    
    # creating training and testing sets
    Xtrain = Xnumeric[Xnumeric.AnimalID.isin(Ptrain.AnimalID)]
    Xtest = Xnumeric[Xnumeric.AnimalID.isin(Ptest.AnimalID)]
    # len(Xtrain) + len(Xtest) == len(datapan) -1
    
    trainPets = Xtrain.groupby(['AnimalID']).AnimalID.nunique()
    testPets = Xtest.groupby(['AnimalID']).AnimalID.nunique()
    print 'Number of pets in training set %i, for a total of %i instances' % (len(trainPets), len(Xtrain))
    print 'Number of pets in testing  set %i, for a total of %i instances \n' % (len(testPets), len(Xtest))
    
    
    # Getting actual outcome 
    cols = ['AnimalID','OutcomeType']
    outcome = datapan[cols]
    # creating training and testing sets
    ytrain = outcome[outcome.AnimalID.isin(Ptrain.AnimalID)]
    ytest = outcome[outcome.AnimalID.isin(Ptest.AnimalID)]
    # Check that all paying customers are in training and testing sets
    len(ytrain) + len(ytest) == len(pets)
    
    '''    
    Xtrain.columns
    names = ['AnimalType','SexuponOutcome','AgeuponOutcome',	'Breed',	'Color']
    XtrainNumeric = pan.get_dummies(Xtrain[names])
    XtestNumeric = pan.get_dummies(Xtest[names])
    '''
    Xtrain.pop('AnimalID')
    Xtest.pop('AnimalID')
    print 'Done\n'
    return X, Xtrain, ytrain, Xtest, ytest


X, Xtrain, ytrain, Xtest, ytest = GetData()
Number of pets in training set 21382, for a total of 21382 instances
Number of pets in testing  set 5346, for a total of 5346 instances 

Done

C:\Users\Julio\Anaconda2\lib\site-packages\ipykernel\__main__.py:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Exploratory data analysis

In [79]:
%matplotlib inline
X['OutcomeType'].value_counts().plot(kind='bar', color = 'green', title = 'Distribution of OutcomeType',figsize=(10,6))
Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x408ca400>
In [ ]:
''' Adoption is the most frequente outcome followed by transfer. The shelter might not deal with some breeds?  is ut full in capacity?
not there are many who died, more from human decisions'''
In [80]:
%matplotlib inline
X['AnimalType'].value_counts().plot(kind='bar', color = 'green', title = 'Distribution of Animal type',figsize=(10,6))
''' Most of the animal in the shelter are dogs'''
Out[80]:
' Most of the animal in the shelter are dogs'
In [76]:
%matplotlib inline
X['SexuponOutcome'].value_counts().plot(kind='bar', color = 'green', title = 'Distribution of Sex upon outcome',figsize=(10,6))
Out[76]:
<matplotlib.axes._subplots.AxesSubplot at 0x64bf4160>
In [72]:
''' Animals that are neutered or spayered have greater posibilities to be adopted, why? Intact Animals are moved to other shelters,
it could be that other shelters whant intact animal. Neutered animals are also transferred possibly as part of rotating programs? 
We do not know if foster families are considered a transfer and it might be the case since is not part of the outcome options. '''

outcome_sex = pan.crosstab(X['OutcomeType'], X['SexuponOutcome'])
%matplotlib inline
plt.show(outcome_sex.plot(kind="bar", title = 'Outcome type to sex',figsize=(10,6)))
In [83]:
outcome_animal = pan.crosstab(X['AnimalType'], X['OutcomeType'])
%matplotlib inline
plt.show(outcome_animal .plot(kind="bar", title = 'Outcome type to Animal type',figsize=(20,6)))

animal_Sex = pan.crosstab(X['SexuponOutcome'], X['AnimalType'])
%matplotlib inline
plt.show(animal_Sex.plot(kind="bar", title = 'Animal type to Sex',figsize=(20,6)))
 
In [65]:
print '\tAge to outcome in days \n',X['AgeuponOutcome'].describe(),'\n'
outcome_age = pan.crosstab(X['AgeuponOutcome'], X['OutcomeType'])
%matplotlib inline
plt.show(outcome_age.plot(kind="bar", title = 'Age to Outcome (days)',figsize=(20,10)))
	Age to outcome in days 
count    26728.000000
mean       794.874963
std       1081.925494
min          0.000000
25%         61.000000
50%        365.000000
75%       1095.000000
max       7300.000000
Name: AgeuponOutcome, dtype: float64 

In [ ]:

The average age of a pet when it leaves the shelter is two years and two months. Idealy we would want to see a normal distribution. That would mean that all outcomes happens for any animal regardless of its age. In the graph 3 “Age to Outcome” shown in the following page we see that animal that are 60 days old have a great possibility of being adopted. People are looking for cubs to grow. When the animals are between one and two years there are three good outcomes for the Austin Shelter. They are either adopted, transfer or return to their family.   One interesting aspect of the data is that animals that get lost of escape from home and return to their owners are mostly 3 years or older. Adoptions and transfers to other shelters happens at any age. These are the two most common outcomes during the first year of life for any type of pet. Pets in bad conditions that are between one or two years old have great probabilities of dying. Adoption, which is the best possible outcome for an animal, has its peak at the second month of life and at 1 year. The trend declines as the animal grows but then it suddenly have another peak at rear 1. Etuhtanasia's peak is at two year old, followed by one year old animals. At day 30 of life many animals have the same outcome.


In [84]:
animal_age = pan.crosstab(X['AgeuponOutcome'], X['AnimalType'])
%matplotlib inline
plt.show(animal_age.plot(kind="bar", title = 'Animal type to age',figsize=(20,6)))
In [ ]:

Cats are more likely to be adopted when they are 6 months old. From the outcome to age analysis we learn that adoption's peak is at day 60. From the graph Animal type to age, we add the information that adoption is more likely for cats, even when they are less in quantity. From the exploratory analysis we have found out that the animal with more probabilities to be adopted in 2 months is a neutered  male cat followed by a spayed female cat. On the other hand an intact male dogs are more likely to be euthanised, followed by intac female dogs.At day 30 many animals are euthanized, and the greates probability is that is a intact male cat.

In [85]:
age_sex = pan.crosstab(X['AgeuponOutcome'], X['SexuponOutcome'])
%matplotlib inline
plt.show(age_sex.plot(kind="bar", title = 'Animal type to age',figsize=(20,6)))

Decision trees

In [39]:
''' Testing for max number of features at each split'''
acc_min_samples_split=[]
for i in range(10, 1000,10):
    treeclf = tree.DecisionTreeClassifier(criterion='gini',max_features=i,max_leaf_nodes=12,min_samples_split= 100, 
                                          max_depth = int(round(len(Xtrain)**.5)),random_state=random_state)
    treeclf = treeclf.fit(Xtrain, ytrain['OutcomeType'])
    treeTest = treeclf.predict(Xtrain)
    acc_min_samples_split.append(treeclf.score(Xtest, ytest['OutcomeType']))
In [25]:
i = [z for z in range(10, 1000,10)]
%matplotlib inline
plt.plot(i,acc_min_samples_split)
plt.xlabel("number of features at each split")
plt.ylabel("Accuracy rate")
plt.title('Accuracy by number of features')
plt.show()
In [22]:
'''Best accuracy is arround 400 features per split. The model would be as follows'''
treeclf = tree.DecisionTreeClassifier(criterion='gini',max_features=400,max_leaf_nodes=12,min_samples_split= 500, 
                                          max_depth = int(round(len(Xtrain)**.5)),random_state=random_state)
treeclf = treeclf.fit(Xtrain, ytrain['OutcomeType'])
treeTest = treeclf.predict(Xtrain)
print classification_report(ytrain['OutcomeType'], treeTest)
print 'Confusion Matrix'
print confusion_matrix(ytrain['OutcomeType'], treeTest)    
print 'Accuracy ',treeclf.score(Xtest, ytest['OutcomeType'])*100
             precision    recall  f1-score   support

   Adoption       0.66      0.85      0.75      8586
       Died       0.80      0.05      0.10       155
 Euthanasia       0.49      0.10      0.17      1238
Return_to_owner       0.53      0.42      0.47      3838
   Transfer       0.73      0.68      0.70      7565

avg / total       0.65      0.67      0.64     21382

Confusion Matrix
[[7331    1    7  591  656]
 [  17    8    2    5  123]
 [ 233    0  126  287  592]
 [1660    0   42 1606  530]
 [1812    1   79  521 5152]]
Accuracy  62.6636737748
In [86]:
treeclf = tree.DecisionTreeClassifier(criterion='gini',max_features=400,max_leaf_nodes=12,min_samples_split= 500, 
                                          max_depth = int(round(len(Xtest)**.5)),random_state=random_state)
tree_scores = cross_validation.cross_val_score(treeclf, Xtest, ytest['OutcomeType'], cv=10)
print("Decision trees overall Accuracy: %0.2f (+/- %0.2f)" % (tree_scores.mean(), tree_scores.std() * 2))
Decision trees overall Accuracy: 0.60 (+/- 0.03)
In [ ]:
'''This model has the best accuracy, however it does a bad prediciton for  animals who eventually die and are ruthananized.
The best metrics are for addoption and transfer This are the most common outcomes. FOr future work implementing Adaboost for those
hard cases could be a good approach since it weights more on hard cases'''

Linear Discriminat Analysis LDA

In [94]:
lda = LDA()
lda = lda.fit(Xtrain, ytrain['OutcomeType'])
lda_scores = cross_validation.cross_val_score(lda, Xtest, ytest['OutcomeType'], cv=10) 
print("LDA overall Accuracy: %0.2f (+/- %0.2f)" % (lda_scores.mean(), lda_scores.std() * 2))
LDA overall Accuracy: 0.60 (+/- 0.04)

Naive Bayes

In [95]:
nbclf = naive_bayes.GaussianNB()
nb_scores = cross_validation.cross_val_score(nbclf, Xtest, ytest['OutcomeType'], cv=10) 

print("Naibe Bayes overall Accuracy: %0.2f (+/- %0.2f)" % (nb_scores.mean(), nb_scores.std() * 2))
Naibe Bayes overall Accuracy: 0.14 (+/- 0.02)

Random Forest

In [8]:
random_state=33

Params = [
    ("RandomForestClassifier, max_features='sqrt'",
        RFC(warm_start=True, oob_score=True,max_features="sqrt",criterion='gini',random_state=random_state)),
    ("RandomForestClassifier, max_features='log2'",
        RFC(warm_start=True, max_features='log2',oob_score=True,criterion='gini',random_state=random_state)),
    ("RandomForestClassifier, max_features=None",
        RFC(warm_start=True, max_features=None,oob_score=True,criterion='gini',random_state=random_state))]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in Params)

# Range of `n_estimators` values to explore.
min_estimators = int(len(Xtrain.columns)**(1/2.))
max_estimators = int(len(Xtrain.columns)**(1/2.)*1.7)


for label, clf in Params:
    for i in range(min_estimators, max_estimators + 1):
        #print i
        clf.set_params(n_estimators=i)
        clf.fit(Xtrain, ytrain['OutcomeType'])
        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = 1 - clf.oob_score_
        error_rate[label].append((i, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()
In [12]:
Image('RFC OOB.png')
Out[12]:
In [109]:
print("Random Forest overall Accuracy: %0.2f (+/- %0.2f)" % (np.mean(ys),np.std(ys) * 2))
print 'Best Random Forest Model Accuracy: %0.2f ' % (1-max(ys))
Random Forest overall Accuracy: 0.40 (+/- 0.00)
Best Random Forest Model Accuracy: 0.60