# Linear regression demo: happiness vs. income.
# Fits a through-the-origin model, plots the data with the fitted line,
# and computes the residual sum of squares in two ways.

# Pause after each complete plot (after several plots if mfrow is used).
par(ask = TRUE)

#--------------------------
# Wait for the user to press Return, showing an optional message.
pause <- function(msg = "") {
  cat("Return...[", msg, "]", "\n")
  readline()
}

#--------------------------
# Read the data file (space-delimited text, per the original comment).
# NOTE(review): read.table() expects plain text; confirm the file is not a
# binary .RData image (which would need load() instead).
df <- read.table("happiness10.RData")

# Check that the data were read in properly.
head(df)
colnames(df)  # check the header labels

# Fit a linear model with no intercept (regression through the origin).
# Why happiness ~ income and not income ~ happiness? Income is the
# predictor here and happiness the response.
m <- lm(df$happiness ~ df$income + 0)

# Plot the data and overlay the regression line.
plot(df$income, df$happiness)
abline(m, col = "red")

# Correlation coefficient of the raw data.
r <- cor(df$income, df$happiness)
cat("correlation coefficient: ", r, "\n")

# What are the slope and intercept?
coefs <- m$coefficients
print(coefs)
pause("after coefs")

# BUG FIX: the model was fit with "+ 0", so it has NO intercept term and
# coefs contains a single element -- the slope. The original code took
# coefs[1] as the intercept and coefs[2] (NA) as the slope, which made the
# hand-computed fit, the method-1 residual sum, and the predictions all NA.
# For a through-the-origin fit the intercept is 0 by construction.
intercept <- 0
slope <- coefs[1]
cat("slope= ", slope, "\n")
cat("intercept= ", intercept, "\n")
pause("after slope/intercept")

# Alternative approach: printing the model shows slope/intercept, but the
# values must still be copied into variables manually (as above).
print("model: ")
print(m)

# Regression line (best line fit): y = intercept + slope * x
# Residual = (model fit) - (measured data);
# we compare the sum of squared residuals computed two ways.

# Method 1: compute the fitted values by hand from slope/intercept.
pause("residuals: method 1")
y <- df$happiness
x <- df$income
model <- intercept + slope * x
residuals <- (model - y)^2
sum.res <- sum(residuals)
cat("sum of residuals: ", sum.res, "\n")

pause("residuals: method 2")
# Method 2: use the residuals stored inside the fitted model object.
sum.res <- sum(m$residuals^2)
print(sum.res)
cat("sum(m$residuals^2)= ", sum.res, "\n")

#-----------------------------------
# Predict happiness for some new income values.
vals <- c(37, 48, 52, 56)
predicted.values <- intercept + slope * vals

m1 <- lm(df$happiness ~ df$income + 0)
# The correlation is based on the scattered data, not on the regression
# line; therefore it is the same whether or not the model contains "+ 0".
#-----------------------------------------------------------------------