# Exercises for Braga's summer school
# define the right working directory
# setwd("C:/R/")

# test the proportions of fear words
# read the file (web version; uncomment the local alternative if offline)
medo <- read.table("http://folk.uio.no/dssantos/cursoR/frykt.txt", header=TRUE)
# medo <- read.table("medo.txt", header=TRUE)
medo
# visualising it
barplot(medo$fear)
# with the names corresponding to the numbers
barplot(medo$fear, names.arg=medo$genre)
barplot(medo$ratio)
barplot(medo$ratio, names.arg=medo$genre, ylab="percent")
barplot(medo$ratio, names.arg=medo$genre, ylab="percent", ylim=c(0,12))
# sort the dataframe (first create the ordering in a variable called "sorting")
sorting <- order(medo$fear/medo$total)
ordered_fear <- medo[sorting, ]
barplot(ordered_fear$ratio, names.arg=ordered_fear$genre, ylab="%", ylim=c(0,10))
# compare the two highest
prop.test(x=c(8150,429), n=c(29821714,2193638), alternative="greater")
# the answer is: we can confidently reject the null hypothesis
# (that the proportion is the same in the two genres)
# compare "encyclopedia" and "textbook"
prop.test(x=c(32,52), n=c(286559,426766), alternative="greater")
# the answer is: we cannot reject the null hypothesis that the
# proportions are the same in the two genres
# one can also test the weaker hypothesis that the proportions are simply different
prop.test(x=c(32,52), n=c(286559,426766), alternative="two.sided")
# but again the answer is: we do not have evidence to reject the hypothesis
# In fact, the numbers in the "ratio" column were not right... they were
# computed manually, outside R, and copied... Better to compute them again
medo$new_ratio <- medo$fear/medo$total
summary(medo)
# What about adding "magazine" and "newspaper"?
# sum fear counts and totals for "magazine" and "newspaper" combined
medo[medo$genre=="magazine", 2] + medo[medo$genre=="newspaper", 2]
medo[medo$genre=="magazine", 3] + medo[medo$genre=="newspaper", 3]

#### correlation
# test the correlation among adjectives and colours
# NOTE: the original assigned this to "colour" but all later code uses
# "colours", which silently picked up the base R function colours() instead
colours <- read.table("http://folk.uio.no/dssantos/cursoR/autCoresCOMPARA.txt", header=TRUE, dec=",")
adjectives <- read.table("http://folk.uio.no/dssantos/cursoR/autAdjCOMPARA.txt", header=TRUE, dec=",")
# colours <- read.table("autCoresCOMPARA.txt", header=TRUE, dec=",")
# adjectives <- read.table("autAdjCOMPARA.txt", header=TRUE, dec=",")
barplot(colours$TotalPalavras)
attach(colours)
plot(TotalPalavras, Ocorrencias)
# replot with author names instead of points
plot(TotalPalavras, Ocorrencias, type="n")
text(TotalPalavras, Ocorrencias, as.character(Autor))
barplot(xtabs(TotalPalavras~Autor, data=colours))
# best way to present it
barplot(xtabs(TotalPalavras~Autor, data=colours), las=2, horiz=TRUE)
barplot(xtabs(Ocorrencias~Autor, data=colours), las=2, horiz=TRUE)
barplot(xtabs(Ocorrencias/TotalPalavras~Autor, data=colours), las=2, horiz=TRUE)
# now the adjectives
barplot(xtabs(Adjetivos~Autor, data=adjectives), las=2, horiz=TRUE)
barplot(xtabs(Adjetivos/TotalPalavras~Autor, data=adjectives), las=2, horiz=TRUE)
# put together the two dataframes in one
together <- merge(colours, adjectives,
                  by.x=c("Autor","TotalPalavras"),
                  by.y=c("Autor","TotalPalavras"))
names(together)
names(together)[6] <- "AdjectivesPer100ThousandWords"
names(together)[4] <- "ColoursPer100ThousandWords"
attach(together)
plot(AdjectivesPer100ThousandWords, ColoursPer100ThousandWords)
# first compute the correlation (between the two samples), then
# test how reliable this correlation is
cor(AdjectivesPer100ThousandWords, ColoursPer100ThousandWords)
cor.test(AdjectivesPer100ThousandWords, ColoursPer100ThousandWords)
# there is enough positive correlation, but maybe not as much as expected?
# Plot both at the same time
both <- matrix(c(Ocorrencias, Adjetivos), ncol=2)
rownames(both) <- Autor
both
barplot(t(both), las=2, beside=TRUE, horiz=TRUE)
# and look at the relative frequencies
bothrel <- matrix(c(ColoursPer100ThousandWords, AdjectivesPer100ThousandWords), ncol=2)
rownames(bothrel) <- Autor
bothrel
barplot(t(bothrel), las=2, beside=TRUE, horiz=TRUE, legend.text=c("colour","adj"))
# most colourful: josé de alencar
# less colourful: camilo
together[Autor=="Osman Lins", ]
together[Autor=="Jorge de Sena", ]
prop.test(c(106,55), c(1286,1798))
# The hypothesis that the proportions are the same in both authors can be rejected

#####
# investigate the distribution of colours in Condiv
cores <- read.table("C:/R/coresCondiv.txt", header=TRUE, dec=",")
boxplot(cores$saúde50PT)
dim(cores)
boxplot(cores[2, 2:19])
cont <- t(cores[2, 2:19])
cont
boxplot(cont)
plot(cont)
contamarelo <- t(cores[1, 2:19])
contver <- t(cores[3, 2:19])
plot(contver)
lines(cont)
lines(contamarelo)

# now the Vercial corpus
# a dataframe with the colours per author (web version; local alternative commented out)
coresVercial <- read.table("http://www.linguateca.pt/Diana/cursoR/AutoresCoresVercial.txt", header=TRUE)
# coresVercial <- read.table("AutoresCoresVercial.txt", header=TRUE)
summary(coresVercial)
dim(coresVercial)
verc.cor <- cor(coresVercial[, 2:16])
verc.dist <- dist(verc.cor)
verc.dist
verc.clust <- hclust(verc.dist)
# plclust() is defunct; plot() on an hclust object draws the dendrogram
plot(verc.clust)
# a dataframe with the colours per literary work
coresObrasVercial <- read.table("http://www.linguateca.pt/Diana/cursoR/ObrasCoresVercial.txt", header=TRUE)
# coresObrasVercial <- read.table("ObrasCoresVercial.txt", header=TRUE)
dim(coresObrasVercial)
overc.cor <- cor(coresObrasVercial[, 2:16])
overc.dist <- dist(overc.cor)
overc.dist
overc.clust <- hclust(overc.dist)
plot(overc.clust)
# factor analysis?
# plot the sizes of the whole AC/DC
contas <- read.table("C:/R/contasELC2012.txt", header=TRUE)
# extract the corpus name (everything before the first "-") from the id
contas$corpus <- gsub("(.*?)-.*", "\\1", contas$id, perl=TRUE)
contas$corpus <- factor(contas$corpus)
colnames(contas) <- c("id","pal","or")
attach(contas)
tam2 <- subset(contas, contas$pal < 100)
hist(tam2$pal, breaks=99)
hist(tam2$or, breaks=10)
boxplot(pal~corpus)
boxplot(or~corpus)

# other clustering, now just with 5 frequent verbs
# (web version; uncomment the local alternative if offline)
kk <- read.table("http://www.linguateca.pt/Diana/cursoR/verbosauxkk.txt")
# kk <- read.table("verbosauxkk.txt")
colnames(kk) <- c("text","ntrads","words","types","ser","estar","ter","ficar","haver")
# for clustering the verbs, based on the texts they occur in
kkv.cor <- cor(kk[, 5:9])
kkv.dist <- dist(kkv.cor)
kkv.clust <- hclust(kkv.dist)
# plclust() is defunct; plot() on an hclust object draws the dendrogram
plot(kkv.clust)
attach(kk)
barplot(xtabs(ser/words~text, data=kk), las=2, horiz=TRUE)
barplot(c(ser, estar, ficar), las=2, beside=TRUE, horiz=TRUE,
        legend.text=c("ser","estar","ficar"))
# build a label vector with the text name on every third bar only
nomes <- vector(length=60)
for (i in 1:60) nomes[i] <- ""
for (i in 1:20) nomes[i*3-2] <- as.character(kk$text[i])
barplot(c(kk[,5], kk[,6], kk[,7]), las=2, beside=TRUE, names.arg=nomes,
        horiz=TRUE, legend.text=c("ser","estar","ficar"))
barplot(c(kk[,5], kk[,6], kk[,7]), col=c("green","red","blue"), las=2,
        beside=TRUE, names.arg=nomes, horiz=TRUE,
        legend.text=c("ser","estar","ficar"))
# clustering of the texts based on these verbs
textos <- t(kk[, 5:9])
textos
colnames(textos) <- kk$text
textos
kkt.cor <- cor(textos, method="spearman")
kkt.dist <- dist(kkt.cor)
kkt.clust <- hclust(kkt.dist)
plot(kkt.clust)
library(cluster)
pltree(diana(kkt.dist))
library(ape)
# BUG FIX: the original called nj(kk.dist), but no object "kk.dist" was ever
# created — the distance matrix for the texts is kkt.dist
oo <- nj(kkt.dist)
plot(oo)
#
# preliminaries for learning
# exercise 1.
# read the document-term data (web version; local alternative commented out)
iht <- read.table("http://folk.uio.no/dssantos/cursoR/iht.txt", header=TRUE, row.names=1)
# iht <- read.table("iht.txt", header=TRUE, row.names=1)
iht.t <- t(iht)
# You will need a vector with the right classification
# I called it spr (spraak = "language")
spr <- data.frame(rownames(iht.t))
colnames(spr) <- "fil"
spr$navn <- spr$fil
# keep only the leading class letter (E, K, H or G) of each file name
spr$navn <- gsub("^([EKHG]).*", "\\1", spr$navn, perl=TRUE)
spr$navn <- factor(spr$navn)
# Now you can invoke the machine learning algorithm of your choice
# using svm as the classification tool
library(e1071)
iht.svm <- svm(iht.t, spr$navn, cross=10)
summary(iht.svm)