# Regression
# Dennis E. Slice (c) 2012
# Income-Happiness
# Fake data

# Sweep the cobwebs
rm(list = ls())

# Usual parms
par(ask = T)
# Make sure single plot
par(mfrow = c(1, 1))

# A function to wait for Return
pause <- function() {
  print("Return...")
  readline()
}

# Repeat complex plots
regre.plots <- function(x, y, yHat = NULL, title = "", xLabel = "", yLabel = "") {
  n = length(x)
  minX = min(0, min(x))    # At least down to zero
  minY = min(0, min(y))    # At least down to zero
  maxX = max(x)
  maxY = max(c(y, yHat))   # get 'em all on plot
  plot(x, y,
       xlim = c(minX, maxX), ylim = c(minY, maxY),
       main = title, xlab = xLabel, ylab = yLabel)
  # points(rep(0, n), y, pch = 21, bg = "red", cex = 0.5)
  meanY = mean(y)
  lines(c(minX, maxX), c(meanY, meanY), lty = "dotted")
  for (i in 1:n) {
    lines(c(x[i], x[i]), c(y[i], meanY), lty = "dotted")
  }
  lines(x, yHat)
  for (i in 1:n) {
    lines(c(x[i], x[i]), c(y[i], yHat[i]))
  }
  yRes = y - yHat
  print(paste("SS  =", sum(yRes * yRes)))
  print(paste("MSE =", sum(yRes * yRes) / (n - 1)))
}

# Read data
# (assumes a whitespace-delimited text table with a header row: income, happiness)
df = read.table("happiness10.RData", header = TRUE)
x = df$income
y = df$happiness

# Model parameters
plotTitle = "Happiness v. Income"
xLbl = "X (Income)"
yLbl = "Y (Happiness)"
n = length(x)
minX = min(x)   # minimum income
maxX = max(x)   # maximum income
# rangeX = maxX - minX

# Initial plot
plot(x, y,
     xlim = c(0, maxX), ylim = c(0, max(y)),
     main = plotTitle, xlab = xLbl, ylab = yLbl)
pause()

# Add projections of happiness onto happiness axis
points(rep(0, n), y, pch = 21, bg = "red")

# Show mean happiness ignoring income
meanHappiness = mean(y)
yHat = rep(meanHappiness, n)
regre.plots(x, y, yHat, title = plotTitle, xLabel = xLbl, yLabel = yLbl)

# Note, sum of squared deviations / (n-1) is var(happiness)
print(paste("var(y) =", var(y)))  # variability in happiness ignoring income

# Can we reduce this variability by taking income into account?
# Let's say each unit increase in income increases happiness by 0.07
b1.guess = 0.07
yHat = b1.guess * x
regre.plots(x, y, yHat, title = plotTitle, xLabel = xLbl, yLabel = yLbl)

# Try a model with an intercept
b0.guess = 2.5
b1.guess = 0.07
yHat = b0.guess + b1.guess * x
regre.plots(x, y, yHat, title = plotTitle, xLabel = xLbl, yLabel = yLbl)

# Well, we could do this all day. Let's turn it over to
# the math-stats folks to compute the answer.
regre = lm(y ~ x)
yHat = regre$fitted.values
regre.plots(x, y, yHat, title = plotTitle, xLabel = xLbl, yLabel = yLbl)

# Put symbols on the fitted values
pause()
points(x, regre$fitted.values, pch = 19)
pause()
points(mean(x), mean(y), pch = 23, cex = 2, bg = "green")
pause()
points(rep(0, n), regre$fitted.values, pch = 21, bg = "red")

# Now let's plot some histograms
xx = c(y, yHat, regre$residuals)
xRange = range(xx)
par(mfrow = c(3, 1))
hist(y, xlim = xRange)
hist(yHat, xlim = xRange)
yResid = regre$residuals
hist(yResid, xlim = xRange)

# SS decomposition
# SS.Y = SS.YHat + SS.YResid
print(y)               # original values...
print(yHat + yResid)   # ...equal fitted values plus residuals
print(paste("SS Total =", sum(y * y)))
print(paste("SS yHat  =", sum(yHat * yHat)))
print(paste("SS resid =", sum(yResid * yResid)))
print("NOTE: it is not usually the case that if c = a + b, then c^2 = a^2 + b^2")
print("E.g., 5 = 3 + 2, but 25 != 9 + 4")
# (For why the decomposition works here anyway, see the orthogonality
# check kept below stop() with the other optional code.)

stop()  # Prevents execution of subsequent code.
# I want to keep the code in the script, but I don't want
# to execute it. Good while developing parts of complex scripts.
# Here, I am just reminding myself of something to show you,
# and keeping around some working code that I decided not to use.
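# --- Added side check (not part of the original demo; kept down here, ------
# --- below stop(), with the other optional code) ---------------------------
# Why the SS decomposition above works: for a least-squares fit, the
# residuals are orthogonal to the fitted values, so the cross term in
# sum((yHat + yResid)^2) drops out and
#   sum(y^2) = sum(yHat^2) + sum(yResid^2).
# A minimal sketch, reusing yHat and yResid from the lm() fit above:
print(paste("cross term 2*sum(yHat*yResid) =", 2 * sum(yHat * yResid)))  # ~0 up to rounding
print(paste("sum(yHat^2) + sum(yResid^2)   =", sum(yHat * yHat) + sum(yResid * yResid)))  # matches sum(y^2)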
# In general, regression plotting is easier with abline()
par(mfrow = c(1, 1))   # back to a single plot after the histogram layout
plot(x, y, xlab = xLbl, ylab = yLbl)
# add the regression line described in regre
abline(regre)
# or roll your own - ab stands for y = a + bx
# abline(a, b), e.g., abline(2.3, 1.2)

# Let's subtract off the mean
x = x - mean(x)
y = y - mean(y)
regre = lm(y ~ x)
yHat = regre$fitted.values
regre.plots(x, y, yHat, title = plotTitle, xLabel = xLbl, yLabel = yLbl)

# Put symbols on the fitted values
points(x, yHat, pch = 19)
pause()
points(mean(x), mean(y), pch = 23, cex = 2, bg = "green")
pause()
points(rep(0, n), yHat, pch = 21, bg = "red")
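# --- Added side check (not part of the original demo) ----------------------
# After mean-centering x and y, the least-squares slope is unchanged and the
# intercept is (numerically) zero, so the fitted line passes through the
# origin, i.e., through (mean(x), mean(y)) of the original data.
# A minimal sketch, reusing the centered fit in regre from just above:
print(coef(regre))          # intercept should be ~0 after centering
print(cov(x, y) / var(x))   # slope computed directly; should match coef(regre)["x"]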