# Exercises for Braga's summer school
# define the right working directory
# setwd("C:/R/")

# test the proportions of fear words
# read the file (web version; uncomment the local alternative if offline)
medo <- read.table("http://folk.uio.no/dssantos/cursoR/frykt.txt", header=TRUE)
# medo <- read.table("medo.txt", header=TRUE)
medo
# visualising it
barplot(medo$fear)
# with the names corresponding to the numbers
barplot(medo$fear, names.arg=medo$genre)
barplot(medo$ratio)
barplot(medo$ratio, names.arg=medo$genre, ylab="percent")
barplot(medo$ratio, names.arg=medo$genre, ylab="percent", ylim=c(0,12))
# sort the dataframe (first create the ordering in a variable called "sorting")
sorting <- order(medo$fear/medo$total)
ordered_fear <- medo[sorting, ]
barplot(ordered_fear$ratio, names.arg=ordered_fear$genre, ylab="%", ylim=c(0,10))
# compare the two highest
prop.test(x=c(8150,429), n=c(29821714,2193638), alternative="greater")
# the answer is: we can confidently reject the null hypothesis
# (that the proportion is the same in the two genres)
# compare "encyclopedia" and "textbook"
prop.test(x=c(32,52), n=c(286559,426766), alternative="greater")
# the answer is: we cannot reject the null hypothesis that the
# proportions are the same in the two genres
# one can also test the weaker hypothesis that the proportions are simply different
prop.test(x=c(32,52), n=c(286559,426766), alternative="two.sided")
# but again the answer is: we do not have evidence to reject the hypothesis
# In fact, the numbers in the "ratio" column were not right... they were
# computed manually, outside R, and copied... Better to compute them again
medo$new_ratio <- medo$fear/medo$total
summary(medo)
# What about adding "magazine" and "newspaper"?
# sum fear counts and totals for "magazine" and "newspaper" combined
medo[medo$genre=="magazine", 2] + medo[medo$genre=="newspaper", 2]
medo[medo$genre=="magazine", 3] + medo[medo$genre=="newspaper", 3]

#### correlation
# test the correlation among adjectives and colours
# NOTE: the original assigned this to "colour" but all later code uses
# "colours", which silently picked up the base R function colours() instead
colours <- read.table("http://folk.uio.no/dssantos/cursoR/autCoresCOMPARA.txt", header=TRUE, dec=",")
adjectives <- read.table("http://folk.uio.no/dssantos/cursoR/autAdjCOMPARA.txt", header=TRUE, dec=",")
# colours <- read.table("autCoresCOMPARA.txt", header=TRUE, dec=",")
# adjectives <- read.table("autAdjCOMPARA.txt", header=TRUE, dec=",")
barplot(colours$TotalPalavras)
attach(colours)
plot(TotalPalavras, Ocorrencias)
# replot with author names instead of points
plot(TotalPalavras, Ocorrencias, type="n")
text(TotalPalavras, Ocorrencias, as.character(Autor))
barplot(xtabs(TotalPalavras~Autor, data=colours))
# best way to present it
barplot(xtabs(TotalPalavras~Autor, data=colours), las=2, horiz=TRUE)
barplot(xtabs(Ocorrencias~Autor, data=colours), las=2, horiz=TRUE)
barplot(xtabs(Ocorrencias/TotalPalavras~Autor, data=colours), las=2, horiz=TRUE)
# now the adjectives
barplot(xtabs(Adjetivos~Autor, data=adjectives), las=2, horiz=TRUE)
barplot(xtabs(Adjetivos/TotalPalavras~Autor, data=adjectives), las=2, horiz=TRUE)
# put together the two dataframes in one
together <- merge(colours, adjectives,
                  by.x=c("Autor","TotalPalavras"),
                  by.y=c("Autor","TotalPalavras"))
names(together)
names(together)[6] <- "AdjectivesPer100ThousandWords"
names(together)[4] <- "ColoursPer100ThousandWords"
attach(together)
plot(AdjectivesPer100ThousandWords, ColoursPer100ThousandWords)
# first compute the correlation (between the two samples), then
# test how reliable this correlation is
cor(AdjectivesPer100ThousandWords, ColoursPer100ThousandWords)
cor.test(AdjectivesPer100ThousandWords, ColoursPer100ThousandWords)
# there is enough positive correlation, but maybe not as much as expected?
# Plot both at the same time
both <- matrix(c(Ocorrencias, Adjetivos), ncol=2)
rownames(both) <- Autor
both
barplot(t(both), las=2, beside=TRUE, horiz=TRUE)
# and look at the relative frequencies
bothrel <- matrix(c(ColoursPer100ThousandWords, AdjectivesPer100ThousandWords), ncol=2)
rownames(bothrel) <- Autor
bothrel
barplot(t(bothrel), las=2, beside=TRUE, horiz=TRUE, legend.text=c("colour","adj"))
# most colourful: josé de alencar
# less colourful: camilo
together[Autor=="Osman Lins", ]
together[Autor=="Jorge de Sena", ]
prop.test(c(106,55), c(1286,1798))
# The hypothesis that the proportions are the same in both authors can be rejected

#####
# investigate the distribution of colours in Condiv
cores <- read.table("C:/R/coresCondiv.txt", header=TRUE, dec=",")
boxplot(cores$saúde50PT)
dim(cores)
boxplot(cores[2, 2:19])
cont <- t(cores[2, 2:19])
cont
boxplot(cont)
plot(cont)
contamarelo <- t(cores[1, 2:19])
contver <- t(cores[3, 2:19])
plot(contver)
lines(cont)
lines(contamarelo)

# now the Vercial corpus
# a dataframe with the colours per author (web version; local alternative commented out)
coresVercial <- read.table("http://www.linguateca.pt/Diana/cursoR/AutoresCoresVercial.txt", header=TRUE)
# coresVercial <- read.table("AutoresCoresVercial.txt", header=TRUE)
summary(coresVercial)
dim(coresVercial)
verc.cor <- cor(coresVercial[, 2:16])
verc.dist <- dist(verc.cor)
verc.dist
verc.clust <- hclust(verc.dist)
# plclust() is defunct; plot() on an hclust object draws the dendrogram
plot(verc.clust)
# a dataframe with the colours per literary work
coresObrasVercial <- read.table("http://www.linguateca.pt/Diana/cursoR/ObrasCoresVercial.txt", header=TRUE)
# coresObrasVercial <- read.table("ObrasCoresVercial.txt", header=TRUE)
dim(coresObrasVercial)
overc.cor <- cor(coresObrasVercial[, 2:16])
overc.dist <- dist(overc.cor)
overc.dist
overc.clust <- hclust(overc.dist)
plot(overc.clust)
# factor analysis?
# plot the sizes of the whole AC/DC
contas <- read.table("C:/R/contasELC2012.txt", header=TRUE)
# extract the corpus name (everything before the first "-") from the id
contas$corpus <- gsub("(.*?)-.*", "\\1", contas$id, perl=TRUE)
contas$corpus <- factor(contas$corpus)
colnames(contas) <- c("id","pal","or")
attach(contas)
tam2 <- subset(contas, contas$pal < 100)
hist(tam2$pal, breaks=99)
hist(tam2$or, breaks=10)
boxplot(pal~corpus)
boxplot(or~corpus)

# other clustering, now just with 5 frequent verbs
# (web version; uncomment the local alternative if offline)
kk <- read.table("http://www.linguateca.pt/Diana/cursoR/verbosauxkk.txt")
# kk <- read.table("verbosauxkk.txt")
colnames(kk) <- c("text","ntrads","words","types","ser","estar","ter","ficar","haver")
# for clustering the verbs, based on the texts they occur in
kkv.cor <- cor(kk[, 5:9])
kkv.dist <- dist(kkv.cor)
kkv.clust <- hclust(kkv.dist)
# plclust() is defunct; plot() on an hclust object draws the dendrogram
plot(kkv.clust)
attach(kk)
barplot(xtabs(ser/words~text, data=kk), las=2, horiz=TRUE)
barplot(c(ser, estar, ficar), las=2, beside=TRUE, horiz=TRUE,
        legend.text=c("ser","estar","ficar"))
# build a label vector with the text name on every third bar only
nomes <- vector(length=60)
for (i in 1:60) nomes[i] <- ""
for (i in 1:20) nomes[i*3-2] <- as.character(kk$text[i])
barplot(c(kk[,5], kk[,6], kk[,7]), las=2, beside=TRUE, names.arg=nomes,
        horiz=TRUE, legend.text=c("ser","estar","ficar"))
barplot(c(kk[,5], kk[,6], kk[,7]), col=c("green","red","blue"), las=2,
        beside=TRUE, names.arg=nomes, horiz=TRUE,
        legend.text=c("ser","estar","ficar"))
# clustering of the texts based on these verbs
textos <- t(kk[, 5:9])
textos
colnames(textos) <- kk$text
textos
kkt.cor <- cor(textos, method="spearman")
kkt.dist <- dist(kkt.cor)
kkt.clust <- hclust(kkt.dist)
plot(kkt.clust)
library(cluster)
pltree(diana(kkt.dist))
library(ape)
# BUG FIX: the original called nj(kk.dist), but no object "kk.dist" was ever
# created — the distance matrix for the texts is kkt.dist
oo <- nj(kkt.dist)
plot(oo)
#
# preliminaries for learning
# exercise 1.
# read the document-term data (web version; local alternative commented out)
iht <- read.table("http://folk.uio.no/dssantos/cursoR/iht.txt", header=TRUE, row.names=1)
# iht <- read.table("iht.txt", header=TRUE, row.names=1)
iht.t <- t(iht)
# You will need a vector with the right classification
# I called it spr (spraak = "language")
spr <- data.frame(rownames(iht.t))
colnames(spr) <- "fil"
spr$navn <- spr$fil
# keep only the leading class letter (E, K, H or G) of each file name
spr$navn <- gsub("^([EKHG]).*", "\\1", spr$navn, perl=TRUE)
spr$navn <- factor(spr$navn)
# Now you can invoke the machine learning algorithm of your choice
# using svm as the classification tool
library(e1071)
iht.svm <- svm(iht.t, spr$navn, cross=10)
summary(iht.svm)