# Load every plain-text WWF file, tokenise the corpus into lowercase words,
# and build raw and relative (per 100 words) word-frequency tables.

# NOTE(review): a hard-coded setwd() ties the script to one machine. It is kept
# because later code may rely on the working directory it sets.
setwd("~/Desktop/DH project/Conservation NGOs/plaintext WWF files")

# Anchor the pattern so that only names ending in ".txt" are picked up
# (".*txt" also matched any name that merely contained "txt").
files <- list.files(pattern = "\\.txt$")

files

if (length(files) == 0) {
  stop("no .txt files found in ", getwd(), call. = FALSE)
}

# Read each file and flatten the lines into one character vector. sapply()
# returned a list whenever the files had different line counts, and tolower()
# then deparsed each element ('c("...", "...")'), adding bogus tokens such as
# "c" to the word list.
text.v <- unlist(lapply(files, readLines), use.names = FALSE)

text.lower <- tolower(text.v)

# Split every line on non-word characters; this leaves empty strings behind,
# which are removed below.
text.words.1 <- strsplit(text.lower, "\\W")

text.words.v <- unlist(text.words.1)

length(text.words.v)

not.blanks.v <- which(text.words.v != "")

text.words.v <- text.words.v[not.blanks.v]

# Raw counts, sorted from most to least frequent.
words.freqs.t <- table(text.words.v)

sorted.word.freqs.t <- sort(words.freqs.t, decreasing = TRUE)

# Relative frequency expressed as occurrences per 100 words.
sorted.rel.word.freqs.t <- 100 * (sorted.word.freqs.t / sum(sorted.word.freqs.t))

# Spot checks of individual words.
sorted.word.freqs.t["educational"]

sorted.rel.word.freqs.t["beef"]

sorted.rel.word.freqs.t["timber"]

 

 

# Relative frequency (occurrences per 100 words) of each climate-zone term.

# Sum the relative frequencies of all spelling variants in `words`. A variant
# that never occurs in the corpus contributes 0; looking it up directly and
# adding with `+` would turn the whole zone into NA.
zone.rel.freq <- function(words) {
  in.zone <- names(sorted.rel.word.freqs.t) %in% words
  sum(sorted.rel.word.freqs.t[in.zone])
}

tropical <- zone.rel.freq(
  c("tropics", "tropical", "subtropical", "neotropical", "neotropics")
)

temperate <- zone.rel.freq("temperate")

boreal <- zone.rel.freq("boreal")

tropical

temperate

boreal

# now make a barplot of the relative frequencies of all the zones

zone.hits <- tropical + temperate + boreal

zone.hits

# One row, one column per zone, so that barplot() draws one bar per zone.
zones.m <- cbind(tropical, temperate, boreal)

colnames(zones.m) <- c("Tropical", "Temperate", "Boreal")

barplot(
  zones.m,
  beside = TRUE,
  col = "dark blue",
  xlab = "Relative Frequency of Zone Names",
  ylab = "Occurrence per 100 Words",
  space = c(0.1, 0.1, 0.1),
  width = c(1, 1, 1)
)

# Made barplot above, and it's awesome

 

# Barplot of observed vs. expected occurrence for each zone. The expected
# values come from an Excel spreadsheet: the observed total of tropical,
# temperate and boreal occurrences multiplied by each zone's share of map area.

bor.exp <- 0.016770691

temp.exp <- 0.10579375

trop.exp <- 0.080963359

# Interleave observed and expected so each zone's two bars sit side by side.
zones.obs.exp.m <- cbind(
  tropical, trop.exp,
  temperate, temp.exp,
  boreal, bor.exp
)

colnames(zones.obs.exp.m) <- c(
  "Trop Obs", "Trop Exp",
  "Temp Obs", "Temp Exp",
  "Bor Obs", "Bor Exp"
)

# Dark bars are observed values, light bars are expected values.
barplot(
  zones.obs.exp.m,
  beside = TRUE,
  col = rep(c("dark blue", "sky blue"), times = 3),
  xlab = "Relative Frequency of Zone Names",
  ylab = "Occurrence per 100 Words",
  space = rep(0.1, times = 6),
  width = rep(1, times = 6)
)

 

# Relative frequency (occurrences per 100 words) of each continent name,
# counting the noun and the adjective together.

# Sum the relative frequencies of all forms in `words`. A form that never
# occurs in the corpus contributes 0; looking it up directly and adding with
# `+` would turn the whole continent into NA.
continent.rel.freq <- function(words) {
  is.form <- names(sorted.rel.word.freqs.t) %in% words
  sum(sorted.rel.word.freqs.t[is.form])
}

africa.t <- continent.rel.freq(c("africa", "african"))

australia.t <- continent.rel.freq(c("australia", "australian"))

europe.t <- continent.rel.freq(c("europe", "european"))

asia.t <- continent.rel.freq(c("asia", "asian"))

antarctica.t <- continent.rel.freq(c("antarctic", "antarctica"))

arctic.t <- continent.rel.freq("arctic")

 

 

 

# Relative frequency (per 100 words) of "north america" and "north american".
# "america"/"american" are ambiguous on their own, so the word immediately
# before each occurrence is inspected.

# For every position, return a row holding the previous word, the word itself
# and the next word. A neighbour that falls outside the corpus is NA, and zero
# positions give a 0 x 3 matrix, so neither case raises an error.
neighbour.words <- function(positions) {
  word.at <- function(idx) {
    out <- rep(NA_character_, length(idx))
    ok <- idx >= 1 & idx <= length(text.words.v)
    out[ok] <- text.words.v[idx[ok]]
    out
  }
  array(
    c(word.at(positions - 1), word.at(positions), word.at(positions + 1)),
    dim = c(length(positions), 3)
  )
}

# mark all the positions of the word "america"
america.positions.v <- which(text.words.v == "america")

length(america.positions.v)

before.america <- neighbour.words(america.positions.v)

# Column 1 is the word immediately before "america"; count the "north"s and
# turn the count into a relative frequency of "north america".
north.america.hits.v <- which(before.america[, 1] == "north")

rel.freq.north.am <- 100 * length(north.america.hits.v) / length(text.words.v)

rel.freq.north.am

# mark all the positions of the word "american" (with an "n")
american.positions.v <- which(text.words.v == "american")

length(american.positions.v)

before.american <- neighbour.words(american.positions.v)

north.american.v <- which(before.american[, 1] == "north")

rel.freq.north.amn <- 100 * length(north.american.v) / length(text.words.v)

rel.freq.north.amn

# Combined relative frequency of "north america" and "north american".
north.america.v <- rel.freq.north.am + rel.freq.north.amn

 

# now have the combined rel freqs of "north america" and "north american"

# Repeat the "north" procedure for "south" and "latin" america/american, then
# add the four relative frequencies (per 100 words) together.

# Word immediately before each "america" / "american" occurrence.
america.prev.v <- before.america[, 1]
american.prev.v <- before.american[, 1]

# "south america"
south.america.v <- which(america.prev.v == "south")
rel.freq.south.am <- 100 * length(south.america.v) / length(text.words.v)
rel.freq.south.am

# "south american"
south.american.v <- which(american.prev.v == "south")
rel.freq.south.amn <- 100 * length(south.american.v) / length(text.words.v)
rel.freq.south.amn

# "latin america"
latin.america.v <- which(america.prev.v == "latin")
rel.freq.latin.am <- 100 * length(latin.america.v) / length(text.words.v)
rel.freq.latin.am

# "latin american"
latin.american.v <- which(american.prev.v == "latin")
rel.freq.latin.amn <- 100 * length(latin.american.v) / length(text.words.v)
rel.freq.latin.amn

# Latin and South America are reported as a single region.
latin.south.america.v <- rel.freq.latin.am + rel.freq.latin.amn + rel.freq.south.am + rel.freq.south.amn

 

 

# Gather all the relative frequencies (occurrences per 100 words) and print
# each one for inspection.

africa.t

australia.t

europe.t

asia.t

antarctica.t

arctic.t

north.america.v

latin.south.america.v

# now make a barplot of the relative frequencies of all the continents

# One row, one column per continent, so that barplot() draws one bar each.
continents.m <- cbind(
  africa.t, europe.t, asia.t, australia.t,
  arctic.t, latin.south.america.v, north.america.v, antarctica.t
)

# Axis labels, in the same order as the columns above ("Australia" was
# previously misspelled on the plot).
colnames(continents.m) <- c(
  "Africa", "Europe", "Asia", "Australia",
  "Arctic", "S.America", "N.America", "Antarctica"
)

barplot(
  continents.m,
  beside = TRUE,
  col = "dark green",
  xlab = "Relative Frequency of Continent Names",
  ylab = "Occurrence per 100 Words",
  space = rep(0.1, times = 8),
  width = rep(1, times = 8)
)

# Made barplot above, and it's awesome

# Barplot of every continent's observed vs. expected frequency
# (occurrences per 100 words).

# Expected values, entered by hand.
# NOTE(review): presumably derived from map-area shares in a spreadsheet —
# confirm the source of these numbers.
ant.exp <- 0.043828488

s.am.exp <- 0.05909459

africa.exp <- 0.099968348

austr.exp <- 0.025607656

asia.exp <- 0.147736475

n.am.exp <- 0.080270151

europe.exp <- 0.032994479

arctic.exp <- 4.92455E-05

# Interleave observed and expected so each continent's two bars sit together.
continents.obs.exp.m <- cbind(
  africa.t, africa.exp,
  europe.t, europe.exp,
  asia.t, asia.exp,
  australia.t, austr.exp,
  arctic.t, arctic.exp,
  latin.south.america.v, s.am.exp,
  north.america.v, n.am.exp,
  antarctica.t, ant.exp
)

# Only the observed bar of each pair is labelled; the expected bar gets a
# blank label ("Australia" was previously misspelled on the plot).
colnames(continents.obs.exp.m) <- c(
  "Africa", " ", "Europe", " ", "Asia", " ", "Australia", " ",
  "Arctic", " ", "S.America", " ", "N.America", " ", "Antarctica", " "
)

# One colour per bar: 8 observed/expected pairs. The earlier hand-typed vector
# listed only 14 colours for 16 bars and relied on recycling.
barplot(
  continents.obs.exp.m,
  beside = TRUE,
  col = rep(c("dark green", "light green"), times = 8),
  xlab = "Relative Frequency of Continent Names",
  ylab = "Occurrence per 100 Words",
  space = rep(0.1, times = 16),
  width = rep(1, times = 16),
  cex.names = 0.9
)

# Legend anchored at hand-picked plot coordinates (x = 15, y = 0.15).
legend(15, 0.15, c("Observed", "Expected"), fill = c("dark green", "light green"))

 

# Print keywords in context (KWIC): each occurrence of the keyword together
# with `context` words on either side. This works from the word vector that
# already holds the whole corpus, not from the files.

positions.v <- which(text.words.v == "education")

context <- 5

# seq_along() skips the loop entirely when the keyword never occurs
# (1:length() would iterate over c(1, 0) and fail).
for (i in seq_along(positions.v)) {

  # Clamp the window to the corpus so that hits near the first or last word do
  # not produce invalid subscripts or trailing NAs.
  start <- max(1, positions.v[i] - context)

  end <- min(length(text.words.v), positions.v[i] + context)

  cat(text.words.v[start:end], "\n")

}