2017-01-08 part 1 Linear Regression
In [1]:
from datetime import datetime

print('Last run: {}'.format(datetime.now()))
Last run: 2017-01-08 19:13:30.276000
In [2]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[2]:

This is part 1 of an 8-part series of exercises based on the famous Machine Learning course taught by Prof. Andrew Ng at Stanford University. The course is available on Coursera.

Table of Contents

In [3]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')
In [4]:
#import libraries
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

2. Linear Regression with One Variable

Introduction

In this part of the exercise, we will implement linear regression with one variable to predict profits for a food truck. Suppose you are the CEO of a restaurant franchise and are considering different cities for opening a new outlet. The chain already has food trucks in various cities, and you have data on profits and populations for those cities.

You would like to use this data to help you select which city to expand to next. The file ex1data1.txt contains the dataset for our linear regression problem. The first column is the population of a city and the second column is the profit of a food truck in that city. A negative value for profit indicates a loss.

In [5]:
datafile = 'data/ex1data1.txt'
cols = np.loadtxt(datafile,delimiter=',',usecols=(0,1),unpack=True) #Read in comma separated data
In [6]:
#show first 5 elements
cols[0,:5] #1st col: population
Out[6]:
array([ 6.1101,  5.5277,  8.5186,  7.0032,  5.8598])
In [7]:
cols[1,:5] #2nd col: profit
Out[7]:
array([ 17.592 ,   9.1302,  13.662 ,  11.854 ,   6.8233])
In [8]:
cols.shape #num of row,col
Out[8]:
(2L, 97L)
In [9]:
#Form the usual "X" matrix and "y" vector
X = np.transpose(np.array(cols[:-1]))
y = np.transpose(np.array(cols[-1:]))
m = y.size # number of training examples

#Insert the usual column of 1's into the "X" matrix
X = np.insert(X,0,1,axis=1)
In [10]:
X[:5] 
Out[10]:
array([[ 1.    ,  6.1101],
       [ 1.    ,  5.5277],
       [ 1.    ,  8.5186],
       [ 1.    ,  7.0032],
       [ 1.    ,  5.8598]])
In [11]:
y[:5]
Out[11]:
array([[ 17.592 ],
       [  9.1302],
       [ 13.662 ],
       [ 11.854 ],
       [  6.8233]])

2.1 Plotting the Data

In [12]:
#Plot the data to see what it looks like
plt.figure(figsize=(10,6))
plt.plot(X[:,1],y[:,0],'rx',markersize=10)
plt.grid(True) #Always plot.grid true!

#add labels
plt.ylabel('Profit in $10,000s')
plt.xlabel('Population of City in 10,000s')
Out[12]:
<matplotlib.text.Text at 0x7983080>

2.2 Gradient Descent

The objective of linear regression is to minimize the cost function $$ J(\theta)=\frac{1}{2m}\sum\limits_{i=1}^m {(h_{\theta}(x^{(i)}) - y^{(i)})^2 } $$ where the hypothesis $h_{\theta}(x)$ is given by the linear model $$ h_{\theta}(x) = \theta^T x = \theta_0 + \theta_1x_1 $$

Here $T$ denotes the matrix transpose. Recall that the parameters of our model are the $\theta_j$ values; these are the values we will adjust to minimize the cost $J(\theta)$. One way to do this is the batch gradient descent algorithm, in which each iteration performs the update $$ \theta_j := \theta_j - \alpha \frac{1}{m}\sum\limits_{i=1}^m {(h_{\theta}(x^{(i)}) - y^{(i)}) }x^{(i)}_j $$ simultaneously for all $j$. With each step of gradient descent, the parameters $\theta_j$ come closer to the optimal values that achieve the lowest cost $J(\theta)$.
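In vectorized form, with $X$ the $m \times (n+1)$ design matrix (including the column of ones) and $y$ the vector of targets, the cost and the update can be written as $$ J(\theta) = \frac{1}{2m}(X\theta - y)^T(X\theta - y), \qquad \theta := \theta - \frac{\alpha}{m}X^T(X\theta - y). $$ The cost implementation below uses exactly this matrix form; the update is applied one component $\theta_j$ at a time.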

In [13]:
#Define a linear hypothesis function
def h(theta,X):
    '''
    theta: an (n,1) vector of parameters to be optimized
    X: a matrix with m rows and n columns (one row per training example)
    Returns the dot product X.theta, i.e. the model's predictions
    '''
    return np.dot(X,theta)

#Define the cost function
def computeCost(mytheta,X,y): 
    """
    mytheta: an (n,1) vector of theta parameters
    X: a matrix with m rows and n columns
    y: a matrix with m rows and 1 column
    Note: uses the global m (number of training examples)
    """
    return float((1./(2*m)) * np.dot((h(mytheta,X)-y).T, (h(mytheta,X)-y)))
In [14]:
#Test that running computeCost with 0's as theta returns 32.07:

#(theta is a vector with n rows and 1 column (if X has n features) )
initial_theta = np.zeros((X.shape[1],1)) #give an initial guess: all zeros
print(computeCost(initial_theta,X,y))
32.0727338775
In [15]:
iterations = 1500
alpha = 0.01

#Gradient descent minimizing routine
def GradientDescent(X, theta_start = np.zeros((2,1))):
    """
    theta_start: an (n,1) vector of initial theta guess
    X: matrix with m rows and n columns
    Uses the globals y, m, alpha and iterations defined above.
    """
    theta = theta_start
    jvec = [] #Used to plot cost as function of iteration
    thetahistory = [] #Used to visualize the minimization path later
    for _ in range(iterations):
        jvec.append(computeCost(theta,X,y))
        thetahistory.append(list(theta[:,0]))
        #Simultaneously update all theta values using the gradient at the current theta
        tmptheta = theta.copy()
        for j in range(len(tmptheta)):
            tmptheta[j] = theta[j] - (alpha/m)*np.sum((h(theta,X) - y)*np.array(X[:,j]).reshape(m,1))
        theta = tmptheta
    return theta, thetahistory, jvec
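For reference, the inner loop over j can be replaced by a single matrix operation, following the vectorized update written above. A minimal sketch (gradient_descent_vectorized is an illustrative name; it reuses the h and computeCost defined in the cells above):

def gradient_descent_vectorized(X, y, theta, alpha, iterations):
    m = y.size
    jvec = []
    for _ in range(iterations):
        jvec.append(computeCost(theta, X, y))
        #X.T.dot(h(theta,X) - y) is m times the gradient of J at the current theta
        theta = theta - (alpha/m) * np.dot(X.T, h(theta, X) - y)
    return theta, jvec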

Convergence of Cost Function

In [16]:
#Actually run gradient descent to get the best-fit theta values
initial_theta = np.zeros((X.shape[1],1))
theta, thetahistory, jvec = GradientDescent(X,initial_theta)

#Plot the convergence of the cost function
def plotConvergence(jvec):
    plt.figure(figsize=(10,6))
    plt.plot(range(len(jvec)),jvec,'bo')
    plt.grid(True)
    plt.title("Convergence of Cost Function")
    plt.xlabel("Iteration number")
    plt.ylabel("Cost function")
    plt.xlim([-0.05*iterations,1.05*iterations])
    #plt.ylim([4,8])


plotConvergence(jvec)
plt.ylim([4,7])
Out[16]:
(4, 7)

Plotting Regression Line

In [17]:
#linear regression
def myfit(xval):
    return theta[0] + theta[1]*xval

#Plot linear regression line
plt.figure(figsize=(10,6))
plt.plot(X[:,1],y[:,0],'rx',markersize=10,label='Training Data')
plt.plot(X[:,1],myfit(X[:,1]),'b-',label = 'Linear regression (Gradient Descent):\ny(x) = %0.2f + %0.2fx'%(theta[0],theta[1]))
plt.xlim(4,24)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')

plt.grid(True)
plt.legend(loc='best')
Out[17]:
<matplotlib.legend.Legend at 0x7e4c978>
In [18]:
# Evaluate the fitted line at x = 35000 and x = 70000
# (caution: x is population in units of 10,000s, so a city of 35,000 people
#  corresponds to x = 3.5, not 35000)
print(myfit(35000))
print(myfit(70000))
[ 40840.9844723]
[ 81685.60500807]

2.4 Visualizing J(θ)

In [19]:
#Import necessary matplotlib tools for 3d plots
from mpl_toolkits.mplot3d import axes3d, Axes3D
from matplotlib import cm
import itertools

fig = plt.figure(figsize=(18,6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122, projection='3d')

xvals = np.arange(-10,10,.5)
yvals = np.arange(-1,4,.1)

xx, yy = np.meshgrid(xvals, yvals, indexing='xy')
Z = np.zeros((yvals.size, xvals.size))
for (i,j),v in np.ndenumerate(Z):
    Z[i,j] = computeCost(np.array([[xx[i,j]], [yy[i,j]]]),X,y)
    
myxs, myys, myzs = [], [], []
for xval in xvals:
    for yval in yvals:
        myxs.append(xval)
        myys.append(yval)
        myzs.append(computeCost(np.array([[xval], [yval]]),X,y))

cont = ax1.contour(xx, yy, Z, np.logspace(-1, 4, 30), cmap=plt.cm.jet)
ax1.scatter(theta[0],theta[1], c='r')
ax1.set_xlabel(r'$\theta_0$',fontsize=30)
ax1.set_ylabel(r'$\theta_1$',fontsize=30)

scat = ax2.scatter(myxs,myys,myzs,c=np.abs(myzs),cmap=plt.get_cmap('jet')) #YlOrRd
#scat = ax2.plot_surface(myxs, myys, myzs, rstride=1, cstride=1, alpha=0.6, cmap=plt.cm.jet)

ax2.set_xlabel(r'$\theta_0$',fontsize=30)
ax2.set_ylabel(r'$\theta_1$',fontsize=30)
ax2.set_zlabel(r'Cost',fontsize=20)
ax2.set_title('Minimization Path Shown',fontsize=20)
ax2.plot([x[0] for x in thetahistory],[x[1] for x in thetahistory],jvec,'wo-', markeredgecolor='none', markersize=2)
#change elevation and azimuth
ax2.view_init(elev=40, azim=210)
plt.show()

#to do: make interactive, rotate plot

3. Linear Regression with Multiple Variables

In this part, we will implement linear regression with multiple variables to predict the prices of houses. Suppose we are selling our house and want to know what a good market price would be. One way to do this is to first collect information on recently sold houses and build a model of housing prices.

The file ex1data2.txt contains a training set of housing prices in Portland, Oregon. The first column is the size of the house (in square feet), the second column is the number of bedrooms, and the third column is the price of the house.

In [20]:
datafile = 'data/ex1data2.txt'

#Read into the data file
cols = np.loadtxt(datafile,delimiter=',',usecols=(0,1,2),unpack=True) #Read in comma separated data
In [21]:
#show first 5 elements
cols[0,:5] #1st col: size of the house
Out[21]:
array([ 2104.,  1600.,  2400.,  1416.,  3000.])
In [22]:
cols[1,:5] #2nd col: number of bedrooms
Out[22]:
array([ 3.,  3.,  3.,  2.,  4.])
In [23]:
cols[2,:5] #3rd col: price of the house
Out[23]:
array([ 399900.,  329900.,  369000.,  232000.,  539900.])
In [24]:
#Form the usual "X" matrix and "y" vector
X = np.transpose(np.array(cols[:-1])) 
y = np.transpose(np.array(cols[-1:])) 
m = y.size # number of training examples

X[:5]
Out[24]:
array([[  2.10400000e+03,   3.00000000e+00],
       [  1.60000000e+03,   3.00000000e+00],
       [  2.40000000e+03,   3.00000000e+00],
       [  1.41600000e+03,   2.00000000e+00],
       [  3.00000000e+03,   4.00000000e+00]])
In [25]:
#Insert the usual column of 1's into the "X" matrix
X = np.insert(X,0,1,axis=1)
X[:5]
Out[25]:
array([[  1.00000000e+00,   2.10400000e+03,   3.00000000e+00],
       [  1.00000000e+00,   1.60000000e+03,   3.00000000e+00],
       [  1.00000000e+00,   2.40000000e+03,   3.00000000e+00],
       [  1.00000000e+00,   1.41600000e+03,   2.00000000e+00],
       [  1.00000000e+00,   3.00000000e+03,   4.00000000e+00]])
In [26]:
y[:5] #price of the house
Out[26]:
array([[ 399900.],
       [ 329900.],
       [ 369000.],
       [ 232000.],
       [ 539900.]])
In [27]:
#Quickly visualize data
plt.grid(True)
plt.xlim([-100,5000])
plt.hist(X[:,0],label = 'col1')
plt.hist(X[:,1],label = 'col2')
plt.hist(X[:,2],label = 'col3')
plt.title('Clearly we need feature normalization.')
plt.xlabel('Column Value')
plt.ylabel('Counts')
plt.legend()
Out[27]:
<matplotlib.legend.Legend at 0x9a4bb38>

3.1 Feature normalization

By looking at the values, note that house sizes are about 1000 times the number of bedrooms. When features differ by orders of magnitude, first performing feature scaling can make gradient descent converge much more quickly.

Normalization requires us to:

  • Subtract the mean value of each feature from the dataset.
  • Scale (divide) the feature values by their respective "standard deviations."

The standard deviation is a way of measuring how much variation there is in the range of values of a particular feature (most data points will lie within $\pm$2 standard deviations of the mean); this is an alternative to taking the range of values (max-min). We can use np.std to get the standard deviation.
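As a side note, the same normalization can be written without a loop; a minimal vectorized sketch (equivalent to the loop in the next cell; means, stds and Xnorm_sketch are illustrative names, and the bias column of 1's is skipped because its standard deviation is zero):

means = X[:, 1:].mean(axis=0)
stds = X[:, 1:].std(axis=0)
Xnorm_sketch = X.copy()
Xnorm_sketch[:, 1:] = (X[:, 1:] - means) / stds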

In [28]:
#Feature normalizing the columns (subtract mean, divide by standard deviation)
#Store the mean and std for later use
stored_feature_means, stored_feature_stds = [], []

#Note don't modify the original X matrix, use a copy
Xnorm = X.copy()
for icol in range(Xnorm.shape[1]):
    stored_feature_means.append(np.mean(Xnorm[:,icol]))
    stored_feature_stds.append(np.std(Xnorm[:,icol]))
    #Skip the first column (the bias column of 1's)
    if not icol: continue
    #Faster not to recompute the mean and std; just use the stored values
    Xnorm[:,icol] = (Xnorm[:,icol] - stored_feature_means[-1])/stored_feature_stds[-1]
In [29]:
#Quickly visualize the feature-normalized data
plt.grid(True)
plt.xlim([-5,5])
plt.hist(Xnorm[:,0],label = 'col1')
plt.hist(Xnorm[:,1],label = 'col2')
plt.hist(Xnorm[:,2],label = 'col3')
plt.title('Normalized Features')
plt.xlabel('Column Value')
plt.ylabel('Counts')
plt.legend()
Out[29]:
<matplotlib.legend.Legend at 0x9ed6b38>

3.2. Gradient Descent

Previously, we implemented gradient descent on a univariate regression problem. The only difference now is that there is one more feature in the matrix X. The hypothesis function and the batch gradient descent update rule remain unchanged.

The feature-normalized matrix Xnorm and the initial parameters initial_theta are passed to GradientDescent() to run the cost function and gradient descent for linear regression with multiple variables.

In [30]:
#Run gradient descent with multiple variables, initial theta still set to zeros
initial_theta = np.zeros((Xnorm.shape[1],1))
theta2, thetahistory2, jvec2 = GradientDescent(Xnorm,initial_theta)

#Plot convergence of cost function:
plotConvergence(jvec2)
In [31]:
#Note: this helper only uses the intercept and the first feature's parameter;
#the actual prediction below is made with h(theta2, ...) on the full normalized feature vector
def myfit(xval):
    return theta2[0] + theta2[1]*xval

Prediction of the price of a house with 1650 square feet and 3 bedrooms

In [32]:
ytest = np.array([1650.,3.])

#To "undo" feature normalization, we "undo" 1650 and 3, then plug it into our hypothesis
ytestscaled = [(ytest[x]-stored_feature_means[x+1])/stored_feature_stds[x+1] for x in range(len(ytest))]
ytestscaled.insert(0,1)
In [33]:
#print "Final result theta parameters: \n",theta
print("What is price of house with 1650 square feet and 3 bedrooms?")
print("$%0.2f" % float(h(theta2,ytestscaled)))
What is price of house with 1650 square feet and 3 bedrooms?
$293098.15

3.3 Normal Equations

The closed-form solution to linear regression is $$ \theta = (X^TX)^{-1}X^Ty $$

Using this formula does not require any feature scaling, and we will get an exact solution in one calculation: there is no "loop until convergence" like in gradient descent.
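Explicitly inverting $X^TX$ is fine for a problem this small, but np.linalg.solve is generally the more numerically stable way to evaluate the same expression. A minimal sketch (theta_ne is an illustrative name); the cell below implements the formula with inv() as in the exercise text:

from numpy.linalg import solve
theta_ne = solve(np.dot(X.T, X), np.dot(X.T, y))  #solves (X^T X) theta = X^T y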

In [34]:
from numpy.linalg import inv
#Implementation of normal equation to find analytic solution to linear regression
def normEqtn(X,y):
    #restheta = np.zeros((X.shape[1],1))
    return np.dot(np.dot(inv(np.dot(X.T,X)),X.T),y)
In [35]:
print("Normal equation prediction for price of house with 1650 square feet and 3 bedrooms:")
print("$%0.2f" % float(h(normEqtn(X,y),[1,1650.,3])))
Normal equation prediction for price of house with 1650 square feet and 3 bedrooms:
$293081.46

We find that this gives about the same predicted price as the one obtained from the model fit with gradient descent.

In our calculation, the difference between the two predictions is

In [36]:
float(h(theta2,ytestscaled)) - float(h(normEqtn(X,y),[1,1650.,3]))
Out[36]:
16.68491264153272

Appendix

Comparison with scikit-learn + pandas

Using sklearn.linear_model.LinearRegression is a quick way of doing regression. See other examples here and here.

In [37]:
import pandas as pd

#load data as pandas dataframe
df = pd.read_csv('data/ex1data1.txt', delimiter=',', names=['population','profit'], dtype='float64')
In [38]:
df.head()
Out[38]:
population profit
0 6.1101 17.5920
1 5.5277 9.1302
2 8.5186 13.6620
3 7.0032 11.8540
4 5.8598 6.8233
In [39]:
import pandas as pd

#linear regression
def myfit(xval):
    return theta[0] + theta[1]*xval

x=df.population.values
X=x.reshape(-1,1)
y=df.profit.values

#Plot linear regression line
fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(10,6))
ax1 = df.plot('population', 'profit', kind='scatter',label='Training Data', ax=ax) #markersize=10,
ax2 = ax.plot(x, myfit(x), color='c',label = 'Linear regression (Gradient Descent):\ny(x) = %0.2f + %0.2fx'%(theta[0],theta[1]))

### Compare with Scikit-learn Linear regression 
from sklearn.linear_model import LinearRegression
regr = LinearRegression()


#regr.fit(X[:,1].reshape(-1,1), y.ravel())
regr.fit(X, y)
predicted = regr.intercept_+regr.coef_*x
ax.plot(x, predicted, 'r-', label='Linear regression (Scikit-learn GLM):\ny(x) = %0.2f + %0.2fx'%(regr.intercept_,regr.coef_))

#plt.grid(True)
ax.set_xlim(4,24)
ax.set_ylabel('Profit in $10,000s')
ax.set_xlabel('Population of City in 10,000s')
plt.legend(loc='best')
Out[39]:
<matplotlib.legend.Legend at 0xdfa7630>
In [40]:
# Evaluate the sklearn fit at x = 35000 and x = 70000 using regr.predict
# (same units caveat as before: x is population in units of 10,000s)
vals = [35000, 70000]

for i in vals:
    print(regr.predict([[i]]))  #predict expects a 2-D array of shape (n_samples, n_features)
[ 41752.28176576]
[ 83508.45931239]
In [41]:
# coefficient of determination R**2
R2=regr.score(X,y)
print('R^2={}'.format(R2))
R^2=0.702031553784

An R^2 of about 0.70 suggests that the regression line approximates the data only moderately well. Let's try sklearn's cross_val_predict to visualize the prediction errors.

As shown below, the cross-validated predictions do not fall exactly on a single straight line: each fold is fit on different training data, so the fitted lines have small offsets from one another.

In [42]:
import pandas as pd

#linear regression
def myfit(xval):
    return theta[0] + theta[1]*xval

x=df.population.values
X=x.reshape(-1,1)
y=df.profit.values

#Plot linear regression line
fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(10,6))
ax1 = df.plot('population', 'profit', kind='scatter',label='Training Data', ax=ax) #markersize=10,
ax2 = ax.plot(x, myfit(x), 'c-',label = 'Linear regression (Gradient Descent)')

### Compare with Scikit-learn Linear regression 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
regr = LinearRegression()

regr.fit(X, y)
predicted_crossval = cross_val_predict(regr, X, y, cv=10)
ax.plot(x, predicted_crossval, 'ro', label='Linear Regression (Scikit-learn GLM)\n with cross-validation')
#ax.plot([x.min(), x.max()], [predicted_crossval.min(), predicted_crossval.max()], 'r-',label='Linear Regression (Scikit-learn GLM)\n with cross-validation')

#:\ny(x) = %0.2f + %0.2fx'%(regr.intercept_,regr.coef_)

#plt.grid(True)
ax.set_xlim(4,24)
ax.set_ylabel('Profit in $10,000s')
ax.set_xlabel('Population of City in 10,000s')
plt.legend(loc='best')
Out[42]:
<matplotlib.legend.Legend at 0xe12a278>

Let's compare the in-sample R^2 with the cross-validated R^2 to see how well the model generalizes.
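For reference, both regr.score and metrics.r2_score compute the coefficient of determination $$ R^2 = 1 - \frac{\sum_i (y^{(i)} - \hat{y}^{(i)})^2}{\sum_i (y^{(i)} - \bar{y})^2}, $$ where $\hat{y}^{(i)}$ are the predictions and $\bar{y}$ is the mean of the observed values; the only difference below is which predictions are plugged in (in-sample vs. cross-validated).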

In [43]:
from sklearn import metrics

# coefficient of determination R**2 
R2=regr.score(X,y) #without cross-validation
#R2 = metrics.r2_score(y, predicted)
R2_crossval = metrics.r2_score(y, predicted_crossval)  #with cross-validation

print('R^2={0}\nR^2 (with cross-validation)={1} '.format(R2, R2_crossval))
R^2=0.702031553784
R^2 (with cross-validation)=0.649997941106 

Comparison with seaborn + pandas

Using seaborn.regplot is a quick way of visualizing regression plots.

In [44]:
### Compare with seaborn + Linear regression 
import seaborn as sb

sb.regplot(x="population", y="profit", data=df) #order=1, robust=True, ci=None, x_estimator=np.mean
Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0xe5ae630>

Generating mock data

To generate mock data that mimics the original dataset from the first part of this exercise, we need its mean and covariance. We assume the data can be described by a multivariate normal distribution.

In [45]:
#import statsmodels as sm
#import pandas.stats as stats  #not used below
In [46]:
df.describe()
Out[46]:
population profit
count 97.000000 97.000000
mean 8.159800 5.839135
std 3.869884 5.510262
min 5.026900 -2.680700
25% 5.707700 1.986900
50% 6.589400 4.562300
75% 8.578100 7.046700
max 22.203000 24.147000
In [47]:
import numpy as np

#get the mean
xmean = np.mean(df.population.values)
ymean = np.mean(df.profit.values)

print(xmean,ymean)
(8.1597999999999988, 5.8391350515463927)
In [48]:
#get the covariance
df.cov()
Out[48]:
population profit
population 14.975999 17.86687
profit 17.866870 30.36299
In [49]:
import numpy as np
np.random.seed(8) #for reproducibility

mean = [xmean, ymean]
cov = [df.cov().values[0], df.cov().values[1]]
num_datapoints=len(df)#97
x, y = np.random.multivariate_normal(mean, cov, num_datapoints).T

fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(10,6))
ax3 = sb.regplot(x=df.population, y=df.profit, color="b", label='Original Data', ax=ax) #markersize=10,
ax4 = sb.regplot(x=x, y=y, color="g", label='Mock Data', ax=ax)
plt.legend(loc=2) #'best'
Out[49]:
<matplotlib.legend.Legend at 0xe8d6fd0>

As the joint plot below shows, the data are slightly skewed and have a long tail, so the model could be better represented by a slightly skewed, long-tailed distribution than by a multivariate normal.

In [50]:
import seaborn as sb

sb.jointplot(x="population", y="profit", data=df, kind="reg")
Out[50]:
<seaborn.axisgrid.JointGrid at 0xe6ce400>