setwd("~/Desktop/DH project/Conservation NGOs")
source("Functions.R")
input.dir <- "plaintext WWF files"
files.v <- dir(path=input.dir, pattern="\\.txt$")
# list the txt files in the corpus folder; the anchored pattern avoids matching
# stray files that merely contain "txt" somewhere in their names
files.v
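# optional sanity check: stop here if no txt files turned up, e.g. because the
# working directory or folder name is wrong
stopifnot(length(files.v) > 0)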
book.freqs.1 <- list()
# a list object to hold the results
for(i in 1:length(files.v)){
  doc.object <- scan(file.path(input.dir, files.v[i]), what="character")
  # scan each txt file in the folder into a character vector of words
  worddata <- getwordtablelist(doc.object)
  book.freqs.1[[files.v[i]]] <- worddata
  # get a word-frequency table for each text and store it in the list, keyed by filename
}
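# optional peek at the output. This assumes getwordtablelist() from Functions.R
# returns, for each text, a word table sorted by frequency; if its output is
# shaped differently, adjust the indexing.
head(book.freqs.1[[1]], 10)
# the ten most frequent word types in the first report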
# Next, re-read the files and use a function I wrote to break each one into segments.
# Segments are labelled numerically, collected in a matrix object "segments.m", and
# row-bound onto a master matrix object "topic.m".
chunk.size <- 400
# set the chunk size here, in number of words
topic.m <- NULL
for(i in 1:length(files.v)){
  doc.object <- scan(file.path(input.dir, files.v[i]), what="character")
  chunk.m <- makeFlexTextChunks(doc.object, chunk.size, percentage=FALSE)
  textname <- gsub("\\..*", "", files.v[i])
  segments.m <- cbind(paste(textname, 1:nrow(chunk.m), sep="_"), chunk.m)
  topic.m <- rbind(topic.m, segments.m)
}
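# optional sanity check on the chunking: each chunk should contain roughly
# chunk.size words (this assumes makeFlexTextChunks returns each chunk's text as a
# single string in a one-column matrix; the last chunk of a text may run shorter)
summary(sapply(strsplit(topic.m[, 2], "\\s+"), length))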
# now convert into a dataframe instead of a matrix
documents <- as.data.frame(topic.m, stringsAsFactors=F)
colnames(documents) <- c("id", "text")
# getting data ready to work in mallet
library(mallet)
mallet.instances <- mallet.import(documents$id, documents$text,
                                  "plaintext WWF files/stoplist.csv",
                                  preserve.case=FALSE, token.regexp="[\\p{L}']+")
topic.model <- MalletLDA(num.topics=25)
# above is where you set topic number
topic.model$loadDocuments(mallet.instances)
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
head(word.freqs)
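# optional, before training: ask MALLET to re-estimate the topic hyperparameters
# every 20 iterations after a burn-in of 50, which often sharpens the topics.
# setAlphaOptimization() is a standard method on the mallet package's model object;
# uncomment to try it, or skip it to keep symmetric priors.
# topic.model$setAlphaOptimization(20, 50)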
topic.model$train(400)
# Running the topic model
topic.words.m <- mallet.topic.words(topic.model, smoothed=TRUE, normalized=TRUE)
# generated matrix where each row is a topic and each column a unique word in corpus
colnames(topic.words.m) <- vocabulary
# label the columns with the corpus vocabulary fetched above
topic.words.m[1:3, 1:3]
# the top-left corner of the matrix: the first three topics by the first three words
# now look at the relative weight of specific word types across the topics
keywords <- c("europe", "european")
topic.words.m[, keywords]
# the relative weight of my two keywords in each of the topics
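# a convenience view (not strictly needed): all 25 topics ranked by the combined
# weight they give these keywords
sort(rowSums(topic.words.m[, keywords]), decreasing=TRUE)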
imp.row <- which.max(rowSums(topic.words.m[, keywords]))
# which topic row has the highest combined concentration of both key terms
imp.row
mallet.top.words(topic.model, topic.words.m[imp.row,], 10)
# the ten top-ranked words for the topic with the highest concentration of both my key terms
library(wordcloud)
topic.top.words <- mallet.top.words(topic.model, topic.words.m[imp.row,], 50)
wordcloud(topic.top.words$words, topic.top.words$weights, scale=c(4, .8), rot.per=0, random.order=F)
# made a wordcloud for my "important" topic with 50 words.
for (i in 1:25){
  topic.top.words <- mallet.top.words(topic.model, topic.words.m[i,], 20)
  cat("\n-----------------------", i, "-------------------------\n")
  cat(topic.top.words$words)
}
# printed 20 top words from each of the 25 topics
for (i in 1:25){
  topic.top.words <- mallet.top.words(topic.model, topic.words.m[i,], 50)
  wordcloud(topic.top.words$words, topic.top.words$weights, scale=c(4, .8), rot.per=0, random.order=F)
}
# Made a wordcloud for each of my 25 topics
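# optional: 25 clouds flash past quickly in the plot window, so each one can
# instead be written to a PNG file; "topic_clouds" is just an illustrative folder name
dir.create("topic_clouds", showWarnings=FALSE)
for (i in 1:25){
  topic.top.words <- mallet.top.words(topic.model, topic.words.m[i,], 50)
  png(file.path("topic_clouds", paste0("topic_", i, ".png")), width=800, height=800)
  wordcloud(topic.top.words$words, topic.top.words$weights, scale=c(4, .8), rot.per=0, random.order=F)
  dev.off()
}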
doc.topics.m <- mallet.doc.topics(topic.model, smoothed=T, normalized=T)
# made a matrix object in which each column is a topic and each row is a document from the corpus (remember a "document" here is a 400-word chunk of text, not a whole report/article). Each cell holds the probability of a given topic (column) in a given document (row).
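# sanity check: since the values are normalized probabilities, each row of
# doc.topics.m should sum to (almost exactly) 1
summary(rowSums(doc.topics.m))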
file.ids.v <- documents[,1]
head(file.ids.v)
file.id.1 <- strsplit(file.ids.v, "_")
file.chunk.id.1 <- lapply(file.id.1, rbind)
file.chunk.id.m <- do.call(rbind, file.chunk.id.1)
head(file.chunk.id.m)
# made a two-column matrix out of the txt file name and the number assigned to each chunk during chunking. "_" is the column separator, so txt file names must not contain underscores for this to work.
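# note: the three steps above can be collapsed into one equivalent line, since
# do.call(rbind, ...) will stack the split vectors directly:
# file.chunk.id.m <- do.call(rbind, strsplit(file.ids.v, "_"))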
doc.topics.df <- as.data.frame(doc.topics.m)
# saved my doc.topics matrix as a dataframe, so that it can hold both text and numbers.
doc.topics.df <- cbind(file.chunk.id.m[,1], doc.topics.df)
# bound the file-name column of file.chunk.id.m onto the doc.topics data frame, so each row (one chunk from the corpus) now carries its source file name alongside its topic probabilities
doc.topic.means.df <- aggregate(doc.topics.df[, 2:ncol(doc.topics.df)], list(doc.topics.df[,1]),mean)
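# the barplot below assumes the rows come out in date order; aggregate() sorts its
# groups by name, so check that the alphabetical order of the filenames matches
# the chronological order of the reports
doc.topic.means.df$Group.1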
barplot(doc.topic.means.df[, paste0("V", imp.row)], names.arg=c(1:25), xlab="Texts Arranged by Date (1998-2014)", ylab="'Europe' Topic Probs")
# barplot of the mean probability of the "Europe" topic (the imp.row topic found above) in each txt document
filename <- as.character(doc.topic.means.df[25, "Group.1"])
filename
# got the filename for row 25 of the means table, since the barplot showed that text is dominated by this topic
doc.topic.means.df
# inspect the full table: each row is one of my reports, and each column is one of the 25 topics I had the model sort them into
topic.top.words <- mallet.top.words(topic.model, topic.words.m[11,], 50)
wordcloud(topic.top.words$words, topic.top.words$weights, scale=c(4, .8), rot.per=0, random.order=F)
# made a wordcloud of topic 11
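# finally, the per-report topic means can be saved for work outside R;
# the filename here is just an illustrative choice
write.csv(doc.topic.means.df, "WWF_doc_topic_means.csv", row.names=FALSE)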