from datetime import datetime
print('Last run: {}'.format(datetime.now()))
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Table of Contents
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')
#import libraries
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
2. Linear Regression with One Variable
Introduction
In this part of the exercise, we will implement linear regression with one variable to predict profits for a food truck. Suppose you are the CEO of a restaurant franchise and are considering different cities for opening a new outlet. The chain already has trucks in various cities, and you have data on profits and populations for those cities. You would like to use this data to help you select which city to expand to next.
The file ex1data1.txt
contains the dataset for our linear regression problem.
The first column is the population of a city and the second column is
the profit of a food truck in that city. A negative value for profit indicates a
loss.
datafile = 'data/ex1data1.txt'
cols = np.loadtxt(datafile,delimiter=',',usecols=(0,1),unpack=True) #Read in comma separated data
#show first 5 elements
cols[0,:5] #1st col: population
cols[1,:5] #2nd col: profit
cols.shape #shape is (number of file columns, number of examples)
#Form the usual "X" matrix and "y" vector
X = np.transpose(np.array(cols[:-1]))
y = np.transpose(np.array(cols[-1:]))
m = y.size # number of training examples
#Insert the usual column of 1's into the "X" matrix
X = np.insert(X,0,1,axis=1)
X[:5]
y[:5]
2.1 Plotting the Data
#Plot the data to see what it looks like
plt.figure(figsize=(10,6))
plt.plot(X[:,1],y[:,0],'rx',markersize=10)
plt.grid(True) #Always plot.grid true!
#add labels
plt.ylabel('Profit in $10,000s')
plt.xlabel('Population of City in 10,000s')
2.2 Gradient Descent
The objective of linear regression is to minimize the cost function $$ J(\theta)=\frac{1}{2m}\sum\limits_{i=1}^m {(h_{\theta}(x^{(i)}) - y^{(i)})^2 } $$ where the hypothesis $h_{\theta}(x)$ is given by the linear model $$ h_{\theta}(x) = \theta^T x = \theta_0 + \theta_1x_1 $$
Here $T$ denotes the matrix transpose. Recall that the parameters of our model are the $\theta_j$ values; these are the values we will adjust to minimize the cost $J(\theta)$. One way to do this is to use the batch gradient descent algorithm, in which each iteration performs the update $$ \theta_j := \theta_j - \alpha \frac{1}{m}\sum\limits_{i=1}^m {(h_{\theta}(x^{(i)}) - y^{(i)}) }x^{(i)}_j $$ (simultaneously updating $\theta_j$ for all $j$). With each step of gradient descent, the parameters $\theta_j$ come closer to the optimal values that achieve the lowest cost $J(\theta)$.
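For reference, the same update can be written as a single vectorized step, $\theta := \theta - \frac{\alpha}{m} X^T(X\theta - y)$. The sketch below is illustrative only and is not the implementation used later in this notebook; it assumes X is the $m \times (n+1)$ design matrix with a leading column of ones, y is an $m \times 1$ column vector, theta is an $(n+1) \times 1$ column vector, and the helper name gradient_step is ours.
#Illustrative vectorized form of one batch gradient descent step:
#  theta := theta - (alpha/m) * X^T (X.theta - y)
def gradient_step(theta, X, y, alpha):
    m = y.size                                       #number of training examples
    grad = (1./m)*np.dot(X.T, np.dot(X, theta) - y)  #(n+1) x 1 gradient of J(theta)
    return theta - alpha*grad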
#Define a linear hypothesis function
def h(theta,X):
    '''
    Perform the dot product X.theta:
    theta: an (n+1) x 1 column vector of parameters to be optimized
    X: a design matrix with m rows (training examples) and n+1 columns (features, including the bias column)
    '''
    return np.dot(X,theta)
#Define the cost function
def computeCost(mytheta,X,y):
    """
    mytheta: an (n+1) x 1 column vector of theta parameters (an initial guess)
    X: a design matrix with m rows and n+1 columns
    y: a matrix with m rows and 1 column
    """
    return float((1./(2*m)) * np.dot((h(mytheta,X)-y).T, (h(mytheta,X)-y)))
#Test that running computeCost with 0's as theta returns 32.07:
#(theta is a vector with n rows and 1 column (if X has n features) )
initial_theta = np.zeros((X.shape[1],1)) #initial guess: all zeros
print(computeCost(initial_theta,X,y))
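To make the vectorized cost concrete, here is an equivalent per-example loop; it is only a sanity check (the name computeCostLoop is ours) and should agree with computeCost up to floating-point error.
#Per-example loop version of J(theta), as a cross-check of the vectorized computeCost
def computeCostLoop(mytheta, X, y):
    total = 0.
    for i in range(m):
        err = np.dot(X[i], mytheta)[0] - y[i,0]  #h(x_i) - y_i for training example i
        total += err**2
    return total/(2*m)
print(computeCostLoop(initial_theta, X, y))  #should also print ~32.07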
iterations = 1500
alpha = 0.01
#Gradient descent minimizing routine
def GradientDescent(X, theta_start = np.zeros((2,1))):
    """
    theta_start: an (n+1) x 1 column vector with the initial theta guess
    X: a design matrix with m rows and n+1 columns
    """
    theta = theta_start
    jvec = [] #Used to plot cost as a function of iteration
    thetahistory = [] #Used to visualize the minimization path later
    for _ in range(iterations):
        tmptheta = theta.copy() #work on a copy so all theta_j are updated simultaneously
        jvec.append(computeCost(theta,X,y))
        thetahistory.append(list(theta[:,0]))
        #Simultaneously update all theta values
        for j in range(len(tmptheta)):
            tmptheta[j] = theta[j] - (alpha/m)*np.sum((h(theta,X) - y)*np.array(X[:,j]).reshape(m,1))
        theta = tmptheta
    return theta, thetahistory, jvec
Convergence of Cost Function
#Actually run gradient descent to get the best-fit theta values
initial_theta = np.zeros((X.shape[1],1))
theta, thetahistory, jvec = GradientDescent(X,initial_theta)
#Plot the convergence of the cost function
def plotConvergence(jvec):
    plt.figure(figsize=(10,6))
    plt.plot(range(len(jvec)),jvec,'bo')
    plt.grid(True)
    plt.title("Convergence of Cost Function")
    plt.xlabel("Iteration number")
    plt.ylabel("Cost function")
    plt.xlim([-0.05*iterations,1.05*iterations])
    #plt.ylim([4,8])
plotConvergence(jvec)
plt.ylim([4,7])
Plotting Regression Line
#Linear regression fit using the theta found by gradient descent
def myfit(xval):
    return theta[0] + theta[1]*xval
#Plot linear regression line
plt.figure(figsize=(10,6))
plt.plot(X[:,1],y[:,0],'rx',markersize=10,label='Training Data')
plt.plot(X[:,1],myfit(X[:,1]),'b-',label = 'Linear regression (Gradient Descent):\ny(x) = %0.2f + %0.2fx'%(theta[0],theta[1]))
plt.xlim(4,24)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.grid(True)
plt.legend(loc='best')
# Predict profit for a city with population of 35,000 and of 70,000
# (population is in units of 10,000s and the prediction is in units of $10,000s)
print(float(myfit(3.5))*10000)
print(float(myfit(7.0))*10000)
2.4 Visualizing J(θ)
#Import necessary matplotlib tools for 3d plots
from mpl_toolkits.mplot3d import axes3d, Axes3D
from matplotlib import cm
import itertools
fig = plt.figure(figsize=(18,6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122, projection='3d')
xvals = np.arange(-10,10,.5)
yvals = np.arange(-1,4,.1)
xx, yy = np.meshgrid(xvals, yvals, indexing='xy')
Z = np.zeros((yvals.size, xvals.size))
for (i,j),v in np.ndenumerate(Z):
    Z[i,j] = computeCost(np.array([[xx[i,j]], [yy[i,j]]]),X,y)
myxs, myys, myzs = [], [], []
for xval in xvals:
    for yval in yvals:
        myxs.append(xval)
        myys.append(yval)
        myzs.append(computeCost(np.array([[xval], [yval]]),X,y))
cont = ax1.contour(xx, yy, Z, np.logspace(-1, 4, 30), cmap=plt.cm.jet)
ax1.scatter(theta[0],theta[1], c='r')
ax1.set_xlabel(r'$\theta_0$',fontsize=30)
ax1.set_ylabel(r'$\theta_1$',fontsize=30)
scat = ax2.scatter(myxs,myys,myzs,c=np.abs(myzs),cmap=plt.get_cmap('jet')) #YlOrRd
#scat = ax2.plot_surface(myxs, myys, myzs, rstride=1, cstride=1, alpha=0.6, cmap=plt.cm.jet)
ax2.set_xlabel(r'$\theta_0$',fontsize=30)
ax2.set_ylabel(r'$\theta_1$',fontsize=30)
ax2.set_zlabel(r'Cost',fontsize=20)
ax2.set_title('Minimization Path Shown',fontsize=20)
ax2.plot([x[0] for x in thetahistory],[x[1] for x in thetahistory],jvec,'wo-', markeredgecolor='none', markersize=2)
#change elevation and azimuth
ax2.view_init(elev=40, azim=210)
plt.show()
#to do: make interactive, rotate plot
3. Linear Regression with Multiple Variables
In this part, we will implement linear regression with multiple variables to predict the prices of houses. Suppose we are selling our house and want to know what a good market price would be. One way to do this is to first collect information on recently sold houses and build a model of housing prices.
The file ex1data2.txt
contains a training set of housing prices in Portland, Oregon. The first column is the size of the house (in square feet), the second column is the number of bedrooms, and the third column is the price
of the house.
datafile = 'data/ex1data2.txt'
#Read into the data file
cols = np.loadtxt(datafile,delimiter=',',usecols=(0,1,2),unpack=True) #Read in comma separated data
#show first 5 elements
cols[0,:5] #1st col: size of the house
cols[1,:5] #2nd col: number of bedrooms
cols[2,:5] #3rd col: price of the house
#Form the usual "X" matrix and "y" vector
X = np.transpose(np.array(cols[:-1]))
y = np.transpose(np.array(cols[-1:]))
m = y.size # number of training examples
X[:5]
#Insert the usual column of 1's into the "X" matrix
X = np.insert(X,0,1,axis=1)
X[:5]
y[:5] #price of the house
#Quickly visualize data
plt.grid(True)
plt.xlim([-100,5000])
plt.hist(X[:,0],label = 'col1: bias (all ones)')
plt.hist(X[:,1],label = 'col2: size (sq ft)')
plt.hist(X[:,2],label = 'col3: number of bedrooms')
plt.title('Clearly we need feature normalization.')
plt.xlabel('Column Value')
plt.ylabel('Counts')
plt.legend()
3.1 Feature Normalization
By looking at the values, note that house sizes are about 1000 times the number of bedrooms. When features differ by orders of magnitude, first performing feature scaling can make gradient descent converge much more quickly.
Normalization requires us to:
- Subtract the mean value of each feature from the dataset.
- Scale (divide) the feature values by their respective "standard deviations."
The standard deviation is a way of measuring how much variation there is in the range of values of a particular feature (most data points will lie within $\pm$2 standard deviations of the mean); this is an alternative to taking the range
of values (max-min). We can use np.std
to get the standard deviation.
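The two steps can also be written in vectorized form. The sketch below is an illustration only (the name Xnorm_vec is ours); the loop in the next cell performs the same normalization column by column and stores the means and standard deviations for later use.
#Vectorized sketch of mean-subtraction and std-division,
#applied only to the feature columns (column 0 is the bias column of ones)
means = X[:,1:].mean(axis=0)
stds = X[:,1:].std(axis=0)
Xnorm_vec = X.copy()
Xnorm_vec[:,1:] = (X[:,1:] - means)/stds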
#Feature-normalize the columns (subtract the mean, divide by the standard deviation)
#Store the mean and std of each column for later use
stored_feature_means, stored_feature_stds = [], []
#Note: don't modify the original X matrix, use a copy
Xnorm = X.copy()
for icol in range(Xnorm.shape[1]):
    stored_feature_means.append(np.mean(Xnorm[:,icol]))
    stored_feature_stds.append(np.std(Xnorm[:,icol]))
    #Skip the first column (the bias column of ones)
    if not icol: continue
    #Faster to not recompute the mean and std; just use the stored values
    Xnorm[:,icol] = (Xnorm[:,icol] - stored_feature_means[-1])/stored_feature_stds[-1]
#Quickly visualize the feature-normalized data
plt.grid(True)
plt.xlim([-5,5])
plt.hist(Xnorm[:,0],label = 'col1: bias (all ones)')
plt.hist(Xnorm[:,1],label = 'col2: size (normalized)')
plt.hist(Xnorm[:,2],label = 'col3: number of bedrooms (normalized)')
plt.title('Normalized Features')
plt.xlabel('Column Value')
plt.ylabel('Counts')
plt.legend()
3.2 Gradient Descent
Previously, we implemented gradient descent for a univariate regression problem. The only difference now is that there is one more feature in the matrix X; the hypothesis function and the batch gradient descent update rule remain unchanged. We pass the feature-normalized matrix Xnorm and the initial parameters initial_theta to GradientDescent() to run the cost function and gradient descent for linear regression with multiple variables.
#Run gradient descent with multiple variables, initial theta still set to zeros
initial_theta = np.zeros((Xnorm.shape[1],1))
theta2, thetahistory2, jvec2 = GradientDescent(Xnorm,initial_theta)
#Plot convergence of cost function:
plotConvergence(jvec2)
def myfit(xval):
    return theta2[0] + theta2[1]*xval
Prediction of the price of a house with 1650 square feet and 3 bedrooms
ytest = np.array([1650.,3.])
#Normalize the query point (1650 sq ft, 3 bedrooms) with the stored means and stds,
#then plug the normalized values into the hypothesis
ytestscaled = [(ytest[x]-stored_feature_means[x+1])/stored_feature_stds[x+1] for x in range(len(ytest))]
ytestscaled.insert(0,1)
print("What is the price of a house with 1650 square feet and 3 bedrooms?")
print("$%0.2f" % float(h(theta2,ytestscaled)))
3.3 Normal Equations
The closed-form solution to linear regression is $$ \theta = (X^TX)^{-1}X^Ty $$
Using this formula does not require any feature scaling, and we will get an exact solution in one calculation: there is no "loop until convergence" like in gradient descent.
from numpy.linalg import inv
#Implementation of normal equation to find analytic solution to linear regression
def normEqtn(X,y):
    return np.dot(np.dot(inv(np.dot(X.T,X)),X.T),y)
print("Normal equation prediction for price of house with 1650 square feet and 3 bedrooms:")
print("$%0.2f" % float(h(normEqtn(X,y),[1,1650.,3])))
We find that this gives about the same predicted price as the value we obtained using the model fit with gradient descent. In our calculation, the difference between the two predictions is:
float(h(theta2,ytestscaled)) - float(h(normEqtn(X,y),[1,1650.,3]))
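As an aside (not part of the original exercise): when $X^TX$ is close to singular, inverting it directly can be numerically fragile, and NumPy's pseudo-inverse or least-squares solver yields the same closed-form solution more robustly. A minimal sketch (the names theta_pinv and theta_lstsq are ours):
#Numerically more robust alternatives to inverting X^T X directly
theta_pinv = np.dot(np.linalg.pinv(X), y)           #Moore-Penrose pseudo-inverse
theta_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]  #least-squares solver
print("$%0.2f" % float(h(theta_pinv, [1,1650.,3]))) #should match the normal-equation prediction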
Appendix
Comparison with scikit-learn + pandas
Using sklearn.linear_model.LinearRegression is a quick way of doing regression. See other examples here and here.
import pandas as pd
#load data as pandas dataframe
df = pd.read_csv('data/ex1data1.txt', delimiter=',', names=['population','profit'], dtype='float64')
df.head()
import pandas as pd
#linear regression
def myfit(xval):
    return theta[0] + theta[1]*xval
x=df.population.values
X=x.reshape(-1,1)
y=df.profit.values
#Plot linear regression line
fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(10,6))
ax1 = df.plot('population', 'profit', kind='scatter',label='Training Data', ax=ax) #markersize=10,
ax2 = ax.plot(x, myfit(x), color='c',label = 'Linear regression (Gradient Descent):\ny(x) = %0.2f + %0.2fx'%(theta[0],theta[1]))
### Compare with Scikit-learn Linear regression
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
#regr.fit(X[:,1].reshape(-1,1), y.ravel())
regr.fit(X, y)
predicted = regr.intercept_ + regr.coef_[0]*x
ax.plot(x, predicted, 'r-', label='Linear regression (Scikit-learn GLM):\ny(x) = %0.2f + %0.2fx'%(regr.intercept_,regr.coef_[0]))
#plt.grid(True)
ax.set_xlim(4,24)
ax.set_ylabel('Profit in $10,000s')
ax.set_xlabel('Population of City in 10,000s')
plt.legend(loc='best')
# Predict profit for a city with population of 35,000 and of 70,000 using sklearn's predict
# (population is in units of 10,000s and predictions are in units of $10,000s)
for pop in [3.5, 7.0]:
    print(regr.predict([[pop]])[0] * 10000)
# coefficient of determination R**2
R2=regr.score(X,y)
print('R^2={}'.format(R2))
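As a cross-check of what score() reports, R² can also be computed directly as 1 − SS_res/SS_tot from the predicted values in the cell above; a minimal sketch:
# R^2 = 1 - SS_res/SS_tot, computed by hand as a cross-check of regr.score(X,y)
ss_res = np.sum((y - predicted)**2)   #residual sum of squares
ss_tot = np.sum((y - np.mean(y))**2)  #total sum of squares
print('R^2 (by hand)={}'.format(1 - ss_res/ss_tot))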
The R² value suggests that the regression line does not approximate the real data points very well. Let's try sklearn's cross_val_predict to visualize the prediction errors. As shown below, the predictions from cross_val_predict do not strictly follow a straight line, i.e., they show small offsets.
import pandas as pd
#linear regression
def myfit(xval):
    return theta[0] + theta[1]*xval
x=df.population.values
X=x.reshape(-1,1)
y=df.profit.values
#Plot linear regression line
fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(10,6))
ax1 = df.plot('population', 'profit', kind='scatter',label='Training Data', ax=ax) #markersize=10,
ax2 = ax.plot(x, myfit(x), 'c-',label = 'Linear regression (Gradient Descent)')
### Compare with Scikit-learn Linear regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
regr = LinearRegression()
regr.fit(X, y)
predicted_crossval = cross_val_predict(regr, X, y, cv=10)
ax.plot(x, predicted_crossval, 'ro', label='Linear Regression (Scikit-learn GLM)\n with cross-validation')
#ax.plot([x.min(), x.max()], [predicted_crossval.min(), predicted_crossval.max()], 'r-',label='Linear Regression (Scikit-learn GLM)\n with cross-validation')
#:\ny(x) = %0.2f + %0.2fx'%(regr.intercept_,regr.coef_)
#plt.grid(True)
ax.set_xlim(4,24)
ax.set_ylabel('Profit in $10,000s')
ax.set_xlabel('Population of City in 10,000s')
plt.legend(loc='best')
Let's see if the quality of our prediction has improved with the use of cross-validation.
from sklearn import metrics
# coefficient of determination R**2
R2=regr.score(X,y) #without cross-validation
#R2 = metrics.r2_score(y, predicted)
R2_crossval = metrics.r2_score(y, predicted_crossval) #with cross-validation
print('R^2={0}\nR^2 (with cross-validation)={1} '.format(R2, R2_crossval))
Comparison with seaborn + pandas
Using seaborn.regplot is a quick way of visualizing regression plots.
### Compare with seaborn + Linear regression
import seaborn as sb
sb.regplot(x="population", y="profit", data=df) #order=1, robust=True, ci=None, x_estimator=np.mean
Generating mock data
To reproduce the original data from the first part of this exercise, we need the mean and covariance of the original data. We assume the data can be described by a multivariate normal distribution.
df.describe()
import numpy as np
#get the mean
xmean = np.mean(df.population.values)
ymean = np.mean(df.profit.values)
print(xmean,ymean)
#get the covariance
df.cov()
import numpy as np
np.random.seed(8) #for reproducibility
mean = [xmean, ymean]
cov = [df.cov().values[0], df.cov().values[1]]
num_datapoints=len(df)#97
x, y = np.random.multivariate_normal(mean, cov, num_datapoints).T
fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(10,6))
ax3 = sb.regplot(x=df.population, y=df.profit, color="b", label='Original Data', ax=ax) #markersize=10,
ax4 = sb.regplot(x=x, y=y, color="g", label='Mock Data', ax=ax)
plt.legend(loc=2) #'best'
The data could be better represented by a distribution that is slightly skewed and has a long tail, as shown below.
import seaborn as sb
sb.jointplot(x="population", y="profit", data=df, kind="reg")