# Regression
# Dennis E. Slice (c) 2012
# Income-Happiness
# Fake data

# Sweep the cobwebs
rm(list = ls())

# Usual parms
par(ask = T)
# Make sure single plot
par(mfrow = c(1, 1))

# A function to wait for Return
pause <- function() {
  print("Return...")
  readline()
}

# Repeat complex plots
regre.plots <- function(x, y, yHat = NULL, title = "", xLabel = "", yLabel = "") {
  n = length(x)
  minX = min(0, min(x))    # At least down to zero
  minY = min(0, min(y))    # At least down to zero
  maxX = max(x)
  maxY = max(c(y, yHat))   # get 'em all on plot
  plot(x, y,
       xlim = c(minX, maxX), ylim = c(minY, maxY),
       main = title, xlab = xLabel, ylab = yLabel)
  # points(rep(0, n), y, pch = 21, bg = "red", cex = 0.5)
  meanY = mean(y)
  lines(c(minX, maxX), c(meanY, meanY), lty = "dotted")
  for (i in 1:n) {
    lines(c(x[i], x[i]), c(y[i], meanY), lty = "dotted")
  }
  lines(x, yHat)
  for (i in 1:n) {
    lines(c(x[i], x[i]), c(y[i], yHat[i]))
  }
  yRes = y - yHat
  print(paste("SS  =", sum(yRes * yRes)))
  print(paste("MSE =", sum(yRes * yRes) / (n - 1)))
}

# Read data
# (assumes a whitespace-delimited text table with a header row: income, happiness)
df = read.table("happiness10.RData", header = TRUE)
x = df$income
y = df$happiness

# Model parameters
plotTitle = "Happiness v. Income"
xLbl = "X (Income)"
yLbl = "Y (Happiness)"
n = length(x)
minX = min(x)   # minimum income
maxX = max(x)   # maximum income
# rangeX = maxX - minX

# Initial plot
plot(x, y,
     xlim = c(0, maxX), ylim = c(0, max(y)),
     main = plotTitle, xlab = xLbl, ylab = yLbl)
pause()

# Add projections of happiness onto happiness axis
points(rep(0, n), y, pch = 21, bg = "red")

# Show mean happiness ignoring income
meanHappiness = mean(y)
yHat = rep(meanHappiness, n)
regre.plots(x, y, yHat, title = plotTitle, xLabel = xLbl, yLabel = yLbl)

# Note, sum of squared deviations / (n-1) is var(happiness)
print(paste("var(y) =", var(y)))  # variability in happiness ignoring income

# Can we reduce this variability by taking income into account?
# Let's say each unit increase in income increases happiness by 0.07
b1.guess = 0.07
yHat = b1.guess * x
regre.plots(x, y, yHat, title = plotTitle, xLabel = xLbl, yLabel = yLbl)

# Try a model with an intercept
b0.guess = 2.5
b1.guess = 0.07
yHat = b0.guess + b1.guess * x
regre.plots(x, y, yHat, title = plotTitle, xLabel = xLbl, yLabel = yLbl)

# Well, we could do this all day. Let's turn it over to
# the math-stats folks to compute the answer.
regre = lm(y ~ x)
yHat = regre$fitted.values
regre.plots(x, y, yHat, title = plotTitle, xLabel = xLbl, yLabel = yLbl)

# Put symbols on the fitted values
pause()
points(x, regre$fitted.values, pch = 19)
pause()
points(mean(x), mean(y), pch = 23, cex = 2, bg = "green")
pause()
points(rep(0, n), regre$fitted.values, pch = 21, bg = "red")

# Now let's plot some histograms
xx = c(y, yHat, regre$residuals)
xRange = range(xx)
par(mfrow = c(3, 1))
hist(y, xlim = xRange)
hist(yHat, xlim = xRange)
yResid = regre$residuals
hist(yResid, xlim = xRange)

# SS decomposition
# SS.Y = SS.YHat + SS.YResid
print(y)               # original values...
print(yHat + yResid)   # ...equal fitted values plus residuals
print(paste("SS Total =", sum(y * y)))
print(paste("SS yHat  =", sum(yHat * yHat)))
print(paste("SS resid =", sum(yResid * yResid)))
print("NOTE: it is not usually the case that if c = a + b, then c^2 = a^2 + b^2")
print("E.g., 5 = 3 + 2, but 25 != 9 + 4")
# (For why the decomposition works here anyway, see the orthogonality
# check kept below stop() with the other optional code.)

stop()  # Prevents execution of subsequent code.
# I want to keep the code in the script, but I don't want
# to execute it. Good while developing parts of complex scripts.
# Here, I am just reminding myself of something to show you,
# and keeping around some working code that I decided not to use.
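# --- Added side check (not part of the original demo; kept down here, ------
# --- below stop(), with the other optional code) ---------------------------
# Why the SS decomposition above works: for a least-squares fit, the
# residuals are orthogonal to the fitted values, so the cross term in
# sum((yHat + yResid)^2) drops out and
#   sum(y^2) = sum(yHat^2) + sum(yResid^2).
# A minimal sketch, reusing yHat and yResid from the lm() fit above:
print(paste("cross term 2*sum(yHat*yResid) =", 2 * sum(yHat * yResid)))  # ~0 up to rounding
print(paste("sum(yHat^2) + sum(yResid^2)   =", sum(yHat * yHat) + sum(yResid * yResid)))  # matches sum(y^2)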
# In general, regression plotting is easier with abline()
par(mfrow = c(1, 1))   # back to a single plot after the histogram layout
plot(x, y, xlab = xLbl, ylab = yLbl)
# add the regression line described in regre
abline(regre)
# or roll your own - ab stands for y = a + bx
# abline(a, b), e.g., abline(2.3, 1.2)

# Let's subtract off the mean
x = x - mean(x)
y = y - mean(y)
regre = lm(y ~ x)
yHat = regre$fitted.values
regre.plots(x, y, yHat, title = plotTitle, xLabel = xLbl, yLabel = yLbl)

# Put symbols on the fitted values
points(x, yHat, pch = 19)
pause()
points(mean(x), mean(y), pch = 23, cex = 2, bg = "green")
pause()
points(rep(0, n), yHat, pch = 21, bg = "red")
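# --- Added side check (not part of the original demo) ----------------------
# After mean-centering x and y, the least-squares slope is unchanged and the
# intercept is (numerically) zero, so the fitted line passes through the
# origin, i.e., through (mean(x), mean(y)) of the original data.
# A minimal sketch, reusing the centered fit in regre from just above:
print(coef(regre))          # intercept should be ~0 after centering
print(cov(x, y) / var(x))   # slope computed directly; should match coef(regre)["x"]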