# Generate the criminal/cambridge data set: height and middle finger
# length for criminals, and height of Cambridge men.
# Reconstructed from
#   Macdonell, W. R. 1902. "On Criminal Anthropometry and
#   the Identification of Criminals." Biometrika 1 (2) (January):
#   177-227. doi:10.2307/2331487.
# which is the ultimate source of R's crimtab data.
# See ?crimtab
#
# Output: a text table (written with write.table) holding one row per
# individual with the columns source, height.cm and middle.finger.cm.
#
# The rows of each group are shuffled with sample.int() and no seed is
# set here; call set.seed() before running the script if a reproducible
# row order is needed.
#
# The script deliberately does not clear the workspace. Every object it
# uses is assigned below, and crimtab is read through its namespace so a
# workspace object with the same name cannot shadow the data set.

# ---- Criminals -------------------------------------------------------

# crimtab is a contingency table: rows = middle finger length (cm),
# columns = height (cm), cells = number of criminals.
# unclass() leaves a plain integer matrix with the same dimnames.
cr.counts <- unclass(datasets::crimtab)

# Extract middle finger length and height values from the table
# row/col names.
# cr.finger - criminal middle finger length in cm
# cr.height - criminal height in cm
cr.finger <- as.numeric(rownames(cr.counts))
cr.height <- as.numeric(colnames(cr.counts))

# Count them
cr.finger.n <- length(cr.finger)
cr.height.n <- length(cr.height)

# Flatten the table row by row (every finger length in turn, and within
# it every height), so the expansion below produces the observations in
# the order: all cells of row 1, then all cells of row 2, ...
# t() is needed because as.vector() reads a matrix column by column.
cr.cell.n <- as.vector(t(cr.counts))
cr.cell.height <- rep(cr.height, times = cr.finger.n)
cr.cell.finger <- rep(cr.finger, each = cr.height.n)

# Repeat each (height, finger) pair as many times as it was observed.
# Cells with a count of zero contribute nothing.
cr.height.obs <- rep(cr.cell.height, times = cr.cell.n)
cr.finger.obs <- rep(cr.cell.finger, times = cr.cell.n)

# To make it look like an original, random sample, randomize
# (= shuffle) the rows. See ?sample
cr.shuffle <- sample.int(length(cr.height.obs))

# Assemble the data frame; the source column marks these as criminals.
cr.X <- data.frame(
  source = rep("criminal", length(cr.shuffle)),
  height.cm = cr.height.obs[cr.shuffle],
  middle.finger.cm = cr.finger.obs[cr.shuffle]
)

# ---- Cambridge students ----------------------------------------------

# A vector of the recorded heights.
# Originally in inches, 5'2" to 6'5" (62 to 77 in); converted to cm.
ca.heights <- (60 + seq(1.5, 16.5, 1) + 0.5) * 2.54

# A vector of the number of students in each height class.
# Taken from the table in the original paper.
ca.nPerHeight <- c(
  4, 19, 24.5, 40.5, 84.5, 123.5, 139, 179,
  138.5, 108, 53.5, 47.5, 21, 12, 5, 0.5
)

# How many height classes?
nHeights <- length(ca.heights)

# Each height class needs exactly one count.
stopifnot(length(ca.nPerHeight) == nHeights)

# The published counts contain halves, but only whole individuals can
# be generated. The fractional part of each count is dropped (24.5
# becomes 24, 0.5 becomes 0), so 996 rows are produced although the
# counts sum to 1000. trunc() makes explicit what rep() would otherwise
# do silently with a non-integer 'times'.
ca.height.obs <- rep(ca.heights, times = trunc(ca.nPerHeight))

# Randomize the data so it looks original
nRows <- length(ca.height.obs)
ca.shuffle <- sample.int(nRows)

# Assemble the data frame; finger length is NA because it was not
# recorded for the Cambridge students.
ca.X <- data.frame(
  source = rep("cambridge", nRows),
  height.cm = ca.height.obs[ca.shuffle],
  middle.finger.cm = rep(NA_real_, nRows)
)

# ---- Combine and write -----------------------------------------------

# Combine the criminal data and the cambridge data rows into
# a single data frame.
X <- rbind(cr.X, ca.X)

# Make the source column a factor
X$source <- factor(X$source)

# Write that data frame to a file.
# NOTE: despite the .RData extension this is a plain-text table; read it
# back with read.table(), not load(). The file name is kept unchanged so
# that existing readers of the file keep working.
write.table(X, "./criminal_cambridge.RData")