# Generate the criminal/cambridge data set: height and middle finger
# length for criminals, plus height (only) for Cambridge men.

# Reconstructed from
# Macdonell, W. R. 1902. “On Criminal Anthropometry and 
# the Identification of Criminals.” Biometrika 1 (2) (January): 
# 177–227. doi:10.2307/2331487.

# Which is the ultimate source of R's crimtab data.
# See ?crimtab

# Clean house
rm(list=ls())

#Extract middle finger length and height values from the table 
# row/col names.
# cr.finger - criminal middle finger length in cm
# cr.height - criminal height in cm
cr.finger <- (as.numeric(rownames(crimtab)))
cr.height <- (as.numeric(colnames(crimtab)))

# Count them
cr.finger.n = length(cr.finger)
cr.height.n = length(cr.height)

# Create an empty matrix with two columns - one for height,
# one for middle finger length
cr.X = matrix(NA,0,2)
# Loop over very row (=finger length)
for (i in 1:cr.finger.n) {
	# For every row, visit every column (=height)
	for (j in 1:cr.height.n) { # True if non-zero
		# From crimtab, get the number of criminals with this
		# particular combination of finger length and height
		# If there are more than zero (>0 = TRUE), 
		if (crimtab[i,j]) {
			# Generate that many observations
			# Get the number needed
			n=crimtab[i,j]
			# Duplicate the data pair n times
			x = rep(c(cr.height[j],cr.finger[i]),n)
			# Use dim() to format it as a matrix
			dim(x) = c(2,n)
			# Add the matrix to the bottom of the current data mx.
			# Note, must transpose, t(), to get n rows by 2 columns.
			cr.X = rbind(cr.X,t(x))
		}
	}
}
# To make it look like an original, random sample, randomize
# (=shuffle) the rows. See ?sample
cr.X = cr.X[sample(dim(cr.X)[1]),]

# Create a vector indicating these are criminals
nRows = dim(cr.X)[1]
cr.X = data.frame(rep("criminal",nRows),cr.X)
# Give the columns meaningful names
colnames(cr.X) = c("source","height.cm","middle.finger.cm")

# Next, generate the height data for the Cambridge students, ca...

# First, a vector of the recorded heights
# Originally in inches 5'2" to 6'5". Convert to cm.
ca.heights = (60+seq(1.5,16.5,1)+0.5)*2.54

# A vector of number of students in each height class
# Taken from table in original paper.
ca.nPerHeight = c(4,19,24.5,40.5,84.5,123.5,139,179,138.5,108,53.5,47.5,21,12,5,0.5)

# Empty matrix
ca.X = matrix(NA,0,2)
# How many height classes?
nHeights = length(ca.heights)

# Loop over each height class
for (i in 1:nHeights) {
	# Originally included to print something to the screen for debugging
	# cat("\n",i,"\n")
	# How many do we need for the current height class
	n = ca.nPerHeight[i]
	# Generate that many with NA for finger length since we don't have that data
	x = rep( c( ca.heights[i], NA), n )
	# Dimension into a matrix
	dim(x) = c(2,n)
	# Add to the current data matrix
	ca.X = rbind(ca.X,t(x))
}

# Randomize the data so it looks original
nRows = dim(ca.X)[1]
ca.X = ca.X[sample(nRows),]
# Create vector showing source as Cambridge
ca.X = data.frame(rep("cambridge",nRows),ca.X)
# Give the columns meaningful names
colnames(ca.X) = c("source","height.cm","middle.finger.cm")

# Combine the criminal data and the cambridge data rows into 
# a single data frame.
X = rbind(cr.X,ca.X)
# Make the source column a factor
X$source = factor(X$source)

# Write that data frame to a file
write.table(X,"./criminal_cambridge.RData")