# Regression
# Dennis E. Slice (c) 2012
# Income-Happiness
# Fake data

# Sweep the cobwebs
rm(list = ls())

# Usual parms
par(ask = T)

# A function to wait for Return
pause <- function() {
	print("Return...")
	readline()
}

# Repeat complex plots
regre.plots <- function(	x, 
							y, 
							yHat=NULL,
							title="", 
							xLabel="",
							yLabel="") {
	n = length(x)
	plot(	x, 
			y, 
			xlim = c(0, maxX), 
			ylim = c(0, max(y)), 
			main=title, 
			xlab=xLabel, 
			ylab=yLabel)
#	points(rep(0, n), y, pch = 21, bg = "red", cex=0.5)
	meanY = mean(y)
	lines(	c(0, maxX), c(meanY, meanY), 
			lty = "dotted")
	for (i in 1:n) {
		lines(	c(x[i], x[i]), c(y[i], meanY), 
				lty = "dotted")
	}
	lines(x, yHat)
	for (i in 1:n) {
		lines(c(x[i], x[i]), c(y[i], yHat[i]))
	}
	yRes = y - yHat
	print( sum((yRes*yRes))/(n-1) )
}

# Model parameters
plotTitle = "Happiness v. Income"
xLbl = "X (Income)"
yLbl = "Y (Happiness)"
n = 10 # Small n for illustrative purposes
b0 = 3 # non-zero intercept
b1 = 0.1 # one thousand in income increases happiness by 0.1
minX = 1 # minimum income
maxX = 100 # maximum income
rangeX = maxX - minX

# X - independent variable (free to vary)
x = seq(minX, maxX, rangeX/(n - 1)) # 1000 to 100000 in income
# (n-1)? This gives the number of gaps so the number of points will equal n

# Y - dependent variable, completely determined by x
y = b0 + b1 * x

plot(x, y, main=plotTitle, xlab=xLbl, ylab=yLbl, xlim=c(0,maxX), ylim=c(0,max(y)))
pause()
lines(x,y)

# Generate some error = variation due to anything
# NOT in the model. E.g. sickness, marital status,
# lying about income, etc, etc, etc,...anything!
# sd=2 chosen by trial-and-error for illustrative purposes
error = rnorm(n, 0, 2) 
y = y + error

# Initial plot
plot(	x, 
		y, 
		xlim = c(0, maxX), 
		ylim = c(0, max(y)),
		main=plotTitle,
		xlab=xLbl,
		ylab=yLbl)
pause()
# Add projections of happiness onto happiness axis
points(rep(0, n), y, pch = 21, bg = "red")

# Show mean happiness ignoring income
meanHappiness = mean(y)
yHat = rep(meanHappiness,n)
regre.plots(	x, 
				y, 
				yHat,
				title=plotTitle, 
				xLabel=xLbl,
				yLabel=yLbl)
# Note, sum of squared deviations / n-1 is sd(happiness)
print(paste("var(y)=",var(y))) # variability in happiness ignoring income

# can we reduce this variability by taking income into account?
# Let's say each increase in income increases happiness by 0.07
b1.guess = 0.07
yHat = b1.guess * x
regre.plots(	x, 
				y, 
				yHat,
				title=plotTitle, 
				xLabel=xLbl,
				yLabel=yLbl)


# Try model with intercept
b0.guess = 2.5
b1.guess = 0.07
yHat = b0.guess + b1.guess * x
regre.plots(	x, 
				y, 
				yHat,
				title=plotTitle, 
				xLabel=xLbl,
				yLabel=yLbl)

# Well, we could do this all day. Let's turn it over to 
# the math-stats folks to compute the answer.
regre = lm(y ~ x)
yHat = regre$fitted.values
regre.plots(	x, 
				y, 
				yHat,
				title=plotTitle, 
				xLabel=xLbl,
				yLabel=yLbl)

stop()
# In general, regression plotting is easier with abline()
plot(x,y,xlab=xLbl,ylab=yLbl)
# add regression line described in regre
abline(regre) 
# or roll your own - ab stands for y = a + bx
# abline(a,b), e.g., abline(2.3, 1.2)