# Regression # Dennis E. Slice (c) 2012 # Income-Happiness # Fake data # Sweep the cobwebs rm(list = ls()) # Usual parms par(ask = T) # A function to wait for Return pause <- function() { print("Return...") readline() } # Repeat complex plots regre.plots <- function( x, y, yHat=NULL, title="", xLabel="", yLabel="") { n = length(x) plot( x, y, xlim = c(0, maxX), ylim = c(0, max(y)), main=title, xlab=xLabel, ylab=yLabel) # points(rep(0, n), y, pch = 21, bg = "red", cex=0.5) meanY = mean(y) lines( c(0, maxX), c(meanY, meanY), lty = "dotted") for (i in 1:n) { lines( c(x[i], x[i]), c(y[i], meanY), lty = "dotted") } lines(x, yHat) for (i in 1:n) { lines(c(x[i], x[i]), c(y[i], yHat[i])) } yRes = y - yHat print( sum((yRes*yRes))/(n-1) ) } # Model parameters plotTitle = "Happiness v. Income" xLbl = "X (Income)" yLbl = "Y (Happiness)" n = 10 # Small n for illustrative purposes b0 = 3 # non-zero intercept b1 = 0.1 # one thousand in income increases happiness by 0.1 minX = 1 # minimum income maxX = 100 # maximum income rangeX = maxX - minX # X - independent variable (free to vary) x = seq(minX, maxX, rangeX/(n - 1)) # 1000 to 100000 in income # (n-1)? This gives the number of gaps so the number of points will equal n # Y - dependent variable, completely determined by x y = b0 + b1 * x plot(x, y, main=plotTitle, xlab=xLbl, ylab=yLbl, xlim=c(0,maxX), ylim=c(0,max(y))) pause() lines(x,y) # Generate some error = variation due to anything # NOT in the model. E.g. sickness, marital status, # lying about income, etc, etc, etc,...anything! # sd=2 chosen by trial-and-error for illustrative purposes error = rnorm(n, 0, 2) y = y + error # Initial plot plot( x, y, xlim = c(0, maxX), ylim = c(0, max(y)), main=plotTitle, xlab=xLbl, ylab=yLbl) pause() # Add projections of happiness onto happiness axis points(rep(0, n), y, pch = 21, bg = "red") # Show mean happiness ignoring income meanHappiness = mean(y) yHat = rep(meanHappiness,n) regre.plots( x, y, yHat, title=plotTitle, xLabel=xLbl, yLabel=yLbl) # Note, sum of squared deviations / n-1 is sd(happiness) print(paste("var(y)=",var(y))) # variability in happiness ignoring income # can we reduce this variability by taking income into account? # Let's say each increase in income increases happiness by 0.07 b1.guess = 0.07 yHat = b1.guess * x regre.plots( x, y, yHat, title=plotTitle, xLabel=xLbl, yLabel=yLbl) # Try model with intercept b0.guess = 2.5 b1.guess = 0.07 yHat = b0.guess + b1.guess * x regre.plots( x, y, yHat, title=plotTitle, xLabel=xLbl, yLabel=yLbl) # Well, we could do this all day. Let's turn it over to # the math-stats folks to compute the answer. regre = lm(y ~ x) yHat = regre$fitted.values regre.plots( x, y, yHat, title=plotTitle, xLabel=xLbl, yLabel=yLbl) stop() # In general, regression plotting is easier with abline() plot(x,y,xlab=xLbl,ylab=yLbl) # add regression line described in regre abline(regre) # or roll your own - ab stands for y = a + bx # abline(a,b), e.g., abline(2.3, 1.2)