# R code for error project — Elizabeth Callaway

# Read the Himalaya corpus and tokenise it. scan() splits the file on
# whitespace, tolower() folds case, and strsplit() on "\\W" breaks each token
# at every non-word character, which removes the punctuation.
# NOTE(review): strsplit() returns "" for a token that starts with a non-word
# character (e.g. an opening quote), so text.v can contain empty strings.
setwd("~/Desktop/Error/plain text")
words.1 <- strsplit(tolower(scan("himalayas.txt", what="character")), "\\W")
text.v <- unlist(words.1)

# Positions in text.v of each target form; himalaya.all lists every
# "himalayan" position first, followed by every "himalayas" position.
himalayan.v <- which(text.v=="himalayan")
himalayas.v <- which(text.v=="himalayas")
himalaya.all <- c(himalayan.v, himalayas.v)
himalaya.all

# Build the context matrix for the Himalaya corpus: one row per occurrence of
# "himalayan"/"himalayas" and 61 columns holding the 30 words before the match,
# the match itself (column 31) and the 30 words after it.
window.offsets <- -30:30
surrounds.himalayan <- array(dim=c(length(himalaya.all), length(window.offsets)))
# seq_along() skips the loop entirely when there are no matches; 1:length(x)
# would iterate over c(1, 0) and fail on the empty matrix.
for (i in seq_along(himalaya.all)) {
  window.idx <- himalaya.all[i] + window.offsets
  # A match closer than 30 words to the start of the text yields zero or
  # negative positions, which R drops or rejects as subscripts. Mark them NA so
  # the row is padded with NA, exactly as positions past the end already are.
  window.idx[window.idx < 1] <- NA
  surrounds.himalayan[i,] <- text.v[window.idx]
}
# Sanity check: "glacier" is a word that should show up near "himalayan" a lot.
glacier.v <- which(surrounds.himalayan=="glacier")
glacier.v

# Count the error-related words that occur in the context windows around
# "himalayan"/"himalayas". For every word form, which() returns its positions
# in surrounds.himalayan; each *.himal total is the number of hits for one
# family of related forms (e.g. "error" + "errors").

error.v <- which(surrounds.himalayan == "error")
errors.v <- which(surrounds.himalayan == "errors")
error.himal <- length(errors.v) + length(error.v)

mistake.v <- which(surrounds.himalayan == "mistake")
mistakes.v <- which(surrounds.himalayan == "mistakes")
mistake.himal <- length(mistake.v) + length(mistakes.v)

overestimate.v <- which(surrounds.himalayan == "overestimate")
overestimates.v <- which(surrounds.himalayan == "overestimates")
overestimating.v <- which(surrounds.himalayan == "overestimating")
overestimated.v <- which(surrounds.himalayan == "overestimated")
overestimate.himal <- length(overestimate.v) + length(overestimates.v) +
  length(overestimating.v) + length(overestimated.v)

underestimate.v <- which(surrounds.himalayan == "underestimate")
underestimates.v <- which(surrounds.himalayan == "underestimates")
underestimated.v <- which(surrounds.himalayan == "underestimated")
underestimating.v <- which(surrounds.himalayan == "underestimating")
underestimate.himal <- length(underestimate.v) + length(underestimates.v) +
  length(underestimated.v) + length(underestimating.v)

wrongly.v <- which(surrounds.himalayan == "wrongly")
wrong.v <- which(surrounds.himalayan == "wrong")
wrong.himal <- length(wrongly.v) + length(wrong.v)

incorrect.v <- which(surrounds.himalayan == "incorrect")
incorrectly.v <- which(surrounds.himalayan == "incorrectly")
incorrect.himal <- length(incorrect.v) + length(incorrectly.v)

inaccurate.v <- which(surrounds.himalayan == "inaccurate")
inaccuracy.v <- which(surrounds.himalayan == "inaccuracy")
inaccuracies.v <- which(surrounds.himalayan == "inaccuracies")
# Fixed spelling: the search string used to be "inaccuracately", which can
# never match, so "inaccurately" was silently left out of the total.
inaccurately.v <- which(surrounds.himalayan == "inaccurately")
inaccurate.himal <- length(inaccurate.v) + length(inaccuracy.v) +
  length(inaccuracies.v) + length(inaccurately.v)

failure.v <- which(surrounds.himalayan == "failure")
failure.himal <- length(failure.v)

faulty.v <- which(surrounds.himalayan == "faulty")
faulty.himal <- length(faulty.v)

false.v <- which(surrounds.himalayan == "false")
false.himal <- length(false.v)

speculation.v <- which(surrounds.himalayan == "speculation")
speculation.himal <- length(speculation.v)

blunder.v <- which(surrounds.himalayan == "blunder")
blunders.v <- which(surrounds.himalayan == "blunders")
blunder.himal <- length(blunder.v) + length(blunders.v)

misleading.v <- which(surrounds.himalayan == "misleading")
misleads.v <- which(surrounds.himalayan == "misleads")
misled.v <- which(surrounds.himalayan == "misled")
mislead.v <- which(surrounds.himalayan == "mislead")
mislead.himal <- length(misleading.v) + length(misleads.v) +
  length(misled.v) + length(mislead.v)

#This next bit divides up your barplot into two panes, so two graphs appear side-by-side. Use it, or not if you want to put the "himalayan" graph next to the "sea level" graph.
#par(mfrow = c(1, 2))

# Bar chart of the Himalaya counts: a one-row matrix with one named column per
# error-term family, drawn as 13 bars. The y-axis is fixed at 0-120 (the sea
# level chart uses the same limits).
# NOTE(review): the context window is 30 words on each side of the match, while
# the axis label says "within 60 words" - confirm the intended wording.
himalayan.m <- matrix(
  c(error.himal, mistake.himal, overestimate.himal, underestimate.himal,
    wrong.himal, incorrect.himal, inaccurate.himal, failure.himal,
    faulty.himal, false.himal, speculation.himal, blunder.himal,
    mislead.himal),
  nrow=1,
  dimnames=list(NULL, c("error", "mistake", "overestimate", "underestimate",
                        "wrong", "incorrect", "inaccurate", "failure",
                        "faulty", "false", "speculation", "blunder",
                        "mislead"))
)
barplot(
  himalayan.m,
  beside=TRUE,
  col="dark blue",
  ylab="Number of times term occurs within 60 words of 'Himalayan'",
  space=rep(0.1, 13),
  width=rep(1, 13),
  cex.names=.9,
  las=2,
  main="Error Terms Surrounding 'Himalayan'",
  ylim=c(0,120)
)

# Second corpus: sea level. Same preparation as for the Himalaya text - scan()
# splits the file on whitespace, tolower() folds case, and strsplit() on "\\W"
# breaks each token at every non-word character, removing the punctuation.
# The file is read relative to the current working directory.
words.2 <- strsplit(tolower(scan("sealevel.txt", what="character")), "\\W")
text.u <- unlist(words.2)

# Positions in text.u of the target forms; sealevels.all lists every
# "sealevel" position first, followed by every "sealevels" position.
# NOTE(review): only the single tokens "sealevel"/"sealevels" are matched;
# presumably sealevel.txt already has "sea level" joined into one word - confirm.
sealevel.u <- which(text.u=="sealevel")
sealevels.u <- which(text.u=="sealevels")
sealevels.all <- c(sealevel.u, sealevels.u)

# Build the context matrix for the sea level corpus: one row per occurrence of
# "sealevel"/"sealevels" and 61 columns holding the 30 words before the match,
# the match itself (column 31) and the 30 words after it.
window.offsets <- -30:30
surrounds.sealevel <- array(dim=c(length(sealevels.all), length(window.offsets)))
# seq_along() skips the loop entirely when there are no matches; 1:length(x)
# would iterate over c(1, 0) and fail on the empty matrix.
for (i in seq_along(sealevels.all)) {
  window.idx <- sealevels.all[i] + window.offsets
  # A match closer than 30 words to the start of the text yields zero or
  # negative positions, which R drops or rejects as subscripts. Mark them NA so
  # the row is padded with NA, exactly as positions past the end already are.
  window.idx[window.idx < 1] <- NA
  surrounds.sealevel[i,] <- text.u[window.idx]
}
# Sanity check: "rise" is a word that should occur in association with sea level.
rise.u <- which(surrounds.sealevel=="rise")
rise.u

# Count the error-related words that occur in the context windows around
# "sealevel"/"sealevels". For every word form, which() returns its positions
# in surrounds.sealevel; each *.sealevel total is the number of hits for one
# family of related forms (e.g. "error" + "errors").

error.u <- which(surrounds.sealevel == "error")
errors.u <- which(surrounds.sealevel == "errors")
error.sealevel <- length(errors.u) + length(error.u)

mistake.u <- which(surrounds.sealevel == "mistake")
mistakes.u <- which(surrounds.sealevel == "mistakes")
mistake.sealevel <- length(mistake.u) + length(mistakes.u)

overestimate.u <- which(surrounds.sealevel == "overestimate")
overestimates.u <- which(surrounds.sealevel == "overestimates")
overestimating.u <- which(surrounds.sealevel == "overestimating")
overestimated.u <- which(surrounds.sealevel == "overestimated")
overestimate.sealevel <- length(overestimate.u) + length(overestimates.u) +
  length(overestimating.u) + length(overestimated.u)

underestimate.u <- which(surrounds.sealevel == "underestimate")
underestimates.u <- which(surrounds.sealevel == "underestimates")
underestimated.u <- which(surrounds.sealevel == "underestimated")
underestimating.u <- which(surrounds.sealevel == "underestimating")
underestimate.sealevel <- length(underestimate.u) + length(underestimates.u) +
  length(underestimated.u) + length(underestimating.u)

wrongly.u <- which(surrounds.sealevel == "wrongly")
wrong.u <- which(surrounds.sealevel == "wrong")
wrong.sealevel <- length(wrongly.u) + length(wrong.u)

incorrect.u <- which(surrounds.sealevel == "incorrect")
incorrectly.u <- which(surrounds.sealevel == "incorrectly")
incorrect.sealevel <- length(incorrect.u) + length(incorrectly.u)

inaccurate.u <- which(surrounds.sealevel == "inaccurate")
inaccuracy.u <- which(surrounds.sealevel == "inaccuracy")
inaccuracies.u <- which(surrounds.sealevel == "inaccuracies")
# Fixed spelling: the search string used to be "inaccuracately", which can
# never match, so "inaccurately" was silently left out of the total.
inaccurately.u <- which(surrounds.sealevel == "inaccurately")
inaccurate.sealevel <- length(inaccurate.u) + length(inaccuracy.u) +
  length(inaccuracies.u) + length(inaccurately.u)

failure.u <- which(surrounds.sealevel == "failure")
failure.sealevel <- length(failure.u)

faulty.u <- which(surrounds.sealevel == "faulty")
faulty.sealevel <- length(faulty.u)

false.u <- which(surrounds.sealevel == "false")
false.sealevel <- length(false.u)

speculation.u <- which(surrounds.sealevel == "speculation")
speculation.sealevel <- length(speculation.u)

blunder.u <- which(surrounds.sealevel == "blunder")
blunders.u <- which(surrounds.sealevel == "blunders")
blunder.sealevel <- length(blunder.u) + length(blunders.u)

misleading.u <- which(surrounds.sealevel == "misleading")
misleads.u <- which(surrounds.sealevel == "misleads")
misled.u <- which(surrounds.sealevel == "misled")
mislead.u <- which(surrounds.sealevel == "mislead")
mislead.sealevel <- length(misleading.u) + length(misleads.u) +
  length(misled.u) + length(mislead.u)

# Bar chart of the sea level counts: a one-row matrix with one named column per
# error-term family, drawn as 13 bars. The y-axis is fixed at 0-120 (the
# Himalaya chart uses the same limits).
# NOTE(review): the context window is 30 words on each side of the match, while
# the axis label says "within 60 words" - confirm the intended wording.
sealevel.m <- matrix(
  c(error.sealevel, mistake.sealevel, overestimate.sealevel,
    underestimate.sealevel, wrong.sealevel, incorrect.sealevel,
    inaccurate.sealevel, failure.sealevel, faulty.sealevel, false.sealevel,
    speculation.sealevel, blunder.sealevel, mislead.sealevel),
  nrow=1,
  dimnames=list(NULL, c("error", "mistake", "overestimate", "underestimate",
                        "wrong", "incorrect", "inaccurate", "failure",
                        "faulty", "false", "speculation", "blunder",
                        "mislead"))
)
barplot(
  sealevel.m,
  beside=TRUE,
  col="dark blue",
  ylab="Number of times term occurs within 60 words of 'sea level'",
  space=rep(0.1, 13),
  width=rep(1, 13),
  cex.names=.9,
  las=2,
  main="Error Terms Surrounding 'Sea Level'",
  ylim=c(0,120)
)

# Combined chart: stack the two one-row count matrices so every error-term
# family gets a pair of bars. Row 1 (Himalaya counts) is drawn in sky blue and
# row 2 (sea level counts) in dark blue, because the colours alternate in the
# same order as the bars.
himal.sealevel.m <- rbind(himalayan.m, sealevel.m)
himal.sealevel.m
colnames(himal.sealevel.m) <- c(
  "error", "mistake", "overestimate", "underestimate", "wrong", "incorrect",
  "inaccurate", "failure", "faulty", "false", "speculation", "blunder",
  "mislead"
)
barplot(
  himal.sealevel.m,
  beside=TRUE,
  col=rep(c("sky blue","dark blue"), times=13),
  ylab="Number of Occurrences",
  space=rep(0.1, 26),
  width=rep(1, 26),
  cex.names=.9,
  las=2,
  main="Error Terms within 60 Words of 'Himalayan' or 'Sea Level'"
)