# Load the plain-text WWF corpus, tokenize it into lowercase words, and build
# raw and relative (per-100-words) frequency tables.
# NOTE(review): setwd() in scripts is fragile — it assumes this exact machine
# and layout; consider file.path() with a configurable base directory.
setwd("~/Desktop/DH project/Conservation NGOs/plaintext WWF files")

# Anchor the pattern so only files ENDING in ".txt" match
# (".*txt" would also catch names like "mytxt" or "notes.txt.bak").
files <- list.files(pattern="\\.txt$")
files

# readLines() returns one character vector per file; flatten them into a
# single vector of lines. (The previous sapply()/unname() version kept a
# list, and tolower() on a list deparses each file into one string like
# 'c("line 1", "line 2")', injecting spurious "c" tokens into the counts.)
text.v <- unlist(lapply(files, readLines), use.names=FALSE)
text.lower <- tolower(text.v)

# Tokenize: split on non-word characters; strsplit leaves empty strings
# between adjacent delimiters, so drop those afterwards.
text.words.1 <- strsplit(text.lower, "\\W")
text.words.v <- unlist(text.words.1)
length(text.words.v)
text.words.v <- text.words.v[text.words.v != ""]

# Frequency table sorted most-frequent-first, plus relative frequencies
# expressed as occurrences per 100 words.
words.freqs.t <- table(text.words.v)
sorted.word.freqs.t <- sort(words.freqs.t, decreasing=TRUE)
sorted.rel.word.freqs.t <- 100*(sorted.word.freqs.t/sum(sorted.word.freqs.t))
# Spot-check a few individual frequencies.
sorted.word.freqs.t["educational"]
sorted.rel.word.freqs.t["beef"]
sorted.rel.word.freqs.t["timber"]

# Combined relative frequency (per 100 words) for each climatic zone.
# NOTE(review): a keyword absent from the corpus yields NA and the whole sum
# becomes NA — sum(c(...), na.rm=TRUE) would make this robust; left as-is to
# preserve current behavior.
tropical <- sorted.rel.word.freqs.t["tropics"]+sorted.rel.word.freqs.t["tropical"]+sorted.rel.word.freqs.t["subtropical"]+sorted.rel.word.freqs.t["neotropical"]+sorted.rel.word.freqs.t["neotropics"]
temperate <- sorted.rel.word.freqs.t["temperate"]
boreal <- sorted.rel.word.freqs.t["boreal"]
tropical
temperate
boreal

# Barplot of the relative frequencies of all the zones.
zone.hits <- tropical+temperate+boreal
zone.hits
zones.m <- cbind(tropical, temperate, boreal)
colnames(zones.m) <- c("Tropical", "Temperate", "Boreal")
barplot(zones.m, beside=TRUE, col="dark blue",
        xlab="Relative Frequency of Zone Names",
        ylab="Occurrence per 100 Words",
        space=c(0.1, 0.1, 0.1), width=c(1, 1, 1))
# Barplot of expected vs. observed occurrence per zone. The expected values
# were calculated in an Excel spreadsheet: observed total of TROP/TEMP/BOR
# occurrences times the percentage map area of each zone.
trop.exp <- 0.080963359
temp.exp <- 0.10579375
bor.exp <- 0.016770691

# Interleave observed and expected so each zone's pair of bars sits together.
zones.obs.exp.m <- cbind(tropical, trop.exp, temperate, temp.exp, boreal, bor.exp)
colnames(zones.obs.exp.m) <- c("Trop Obs", "Trop Exp", "Temp Obs", "Temp Exp" , "Bor Obs", "Bor Exp")
barplot(zones.obs.exp.m, beside=TRUE,
        col=c("dark blue", "sky blue", "dark blue", "sky blue", "dark blue", "sky blue"),
        xlab="Relative Frequency of Zone Names",
        ylab="Occurrence per 100 Words",
        space=c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1), width=c(1, 1, 1, 1, 1, 1))
# Combined relative frequencies (per 100 words) for continent/region names.
# NOTE(review): as with the zones, an absent keyword makes the sum NA.
africa.t <- sorted.rel.word.freqs.t["africa"]+sorted.rel.word.freqs.t["african"]
australia.t <- sorted.rel.word.freqs.t["australia"]+sorted.rel.word.freqs.t["australian"]
europe.t <- sorted.rel.word.freqs.t["europe"]+sorted.rel.word.freqs.t["european"]
asia.t <- sorted.rel.word.freqs.t["asia"]+sorted.rel.word.freqs.t["asian"]
antarctica.t <- sorted.rel.word.freqs.t["antarctic"]+sorted.rel.word.freqs.t["antarctica"]
arctic.t <- sorted.rel.word.freqs.t["arctic"]

# Mark every position of the word "america" in the corpus.
america.positions.v <- which(text.words.v=="america")
length(america.positions.v)

# For each hit, record [word before, "america", word after].
# seq_along() guards the zero-hit case (1:length() would iterate over c(1, 0));
# max()/min() clamp the window at the corpus edges, where the original
# indexing (position - 1 = 0, or position + 1 past the end) broke the
# three-column row assignment.
before.america <- array(dim=c(length(america.positions.v), 3))
for (i in seq_along(america.positions.v)) {
  p <- america.positions.v[i]
  before.america[i, ] <- text.words.v[c(max(p - 1, 1), p, min(p + 1, length(text.words.v)))]
}

# Count the hits preceded by "north" and turn that into a relative
# frequency of the bigram "north america" per 100 words.
north.america.v <- which(before.america[,1]=="north")
rel.freq.north.am <- 100*length(north.america.v)/length(text.words.v)
rel.freq.north.am
# Repeat the context analysis for "american" (the "n" variant).
american.positions.v <- which(text.words.v=="american")
length(american.positions.v)

# For each hit, record [word before, "american", word after]; same
# zero-hit and corpus-edge guards as for "america".
before.american <- array(dim=c(length(american.positions.v), 3))
for (i in seq_along(american.positions.v)) {
  p <- american.positions.v[i]
  before.american[i, ] <- text.words.v[c(max(p - 1, 1), p, min(p + 1, length(text.words.v)))]
}

# Relative frequency of "north american" per 100 words.
north.american.v <- which(before.american[,1]=="north")
rel.freq.north.amn <- 100*length(north.american.v)/length(text.words.v)
rel.freq.north.amn

# Combined relative frequency of "north america" + "north american".
# NOTE(review): this reuses north.america.v, overwriting the index vector
# from above with a single number — a distinct name would be clearer.
north.america.v <- rel.freq.north.am+rel.freq.north.amn

# Now the same for "south" and "latin" america/american.
south.america.v <- which(before.america[,1]=="south")
rel.freq.south.am <- 100*length(south.america.v)/length(text.words.v)
rel.freq.south.am

south.american.v <- which(before.american[,1]=="south")
rel.freq.south.amn <- 100*length(south.american.v)/length(text.words.v)
rel.freq.south.amn

latin.america.v <- which(before.america[,1]=="latin")
rel.freq.latin.am <- 100*length(latin.america.v)/length(text.words.v)
rel.freq.latin.am

latin.american.v <- which(before.american[,1]=="latin")
rel.freq.latin.amn <- 100*length(latin.american.v)/length(text.words.v)
rel.freq.latin.amn

# Combined relative frequency for Latin/South America(n).
latin.south.america.v <- rel.freq.latin.am+rel.freq.latin.amn+rel.freq.south.am+rel.freq.south.amn
# Gather all the relative frequencies together and print them.
africa.t
australia.t
europe.t
asia.t
antarctica.t
arctic.t
north.america.v
latin.south.america.v

# Barplot of the relative frequencies of all the continents.
continents.m <- cbind(africa.t, europe.t, asia.t, australia.t, arctic.t, latin.south.america.v, north.america.v, antarctica.t)
# (Fixed label typo: "Austrailia" -> "Australia".)
colnames(continents.m) <- c("Africa", "Europe", "Asia", "Australia", "Arctic", "S.America", "N.America", "Antarctica")
barplot(continents.m, beside=TRUE, col="dark green",
        xlab="Relative Frequency of Continent Names",
        ylab="Occurrence per 100 Words",
        space=c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1),
        width=c(1, 1, 1, 1, 1, 1, 1, 1))
# Barplot of all continents with observed vs. expected frequencies.
# Expected values come from the same spreadsheet calculation as the zones.
ant.exp <- 0.043828488
s.am.exp <-0.05909459
africa.exp <-0.099968348
austr.exp <-0.025607656
asia.exp <-0.147736475
n.am.exp <-0.080270151
europe.exp <-0.032994479
arctic.exp <-4.92455E-05

# Interleave observed and expected so each continent's pair sits together;
# blank names leave room so only the observed bar of each pair is labelled.
continents.obs.exp.m <- cbind(africa.t, africa.exp, europe.t, europe.exp, asia.t, asia.exp, australia.t, austr.exp, arctic.t, arctic.exp, latin.south.america.v, s.am.exp, north.america.v, n.am.exp, antarctica.t, ant.exp)
# (Fixed label typo: "Austrailia" -> "Australia".)
colnames(continents.obs.exp.m) <- c("Africa", " ", "Europe", " ", "Asia", " ", "Australia", " ", "Arctic", " ", "S.America", " ", "N.America", " ", "Antarctica", " ")
# rep() builds the full 16-entry color vector; the original listed only 14
# colors for 16 bars and relied on recycling to keep the alternation.
barplot(continents.obs.exp.m, beside=TRUE,
        col=rep(c("dark green", "light green"), 8),
        xlab="Relative Frequency of Continent Names",
        ylab="Occurrence per 100 Words",
        space=rep(0.1, 16), width=rep(1, 16), cex.names=0.9)
legend(15, 0.15, c("Observed", "Expected"), fill=c("dark green", "light green"))
# Keyword-in-context (KWIC): print each occurrence of the keyword with
# `context` words on either side, reading from the already-loaded corpus
# vector rather than from the files.
positions.v <- which(text.words.v=="education")
context <- 5
# seq_along() guards the zero-hit case; the window is clamped to the corpus
# bounds, because a start <= 0 (hit within the first `context` words) would
# mix negative and positive subscripts and error, and an end past the last
# word would print NAs.
for (i in seq_along(positions.v)) {
  start <- max(positions.v[i] - context, 1)
  end <- min(positions.v[i] + context, length(text.words.v))
  cat(text.words.v[start:end], "\n")
}