Friday, May 3, 2013

Again on polar/star/pie charts

Haven't had much time to devote to new visualisations, mostly because work and baby have taken precedence.

But I just wanted to take a few minutes to share the latest version of the script I showed last time.

It now saves to a pdf file, but that's not the largest change. I actually included one more slice, which is obtained from the others with a formula (e.g. the sum or the average; in the code below, the geometric mean) and is used to rank the pies accordingly. The new value is represented as a white wedge on top of everything else, with its value pasted over it. The script wraps the compounds, fitting only a limited number per page before moving on to the next page.



There are a lot of things which may be changed, such as having the value as a bubble at the centre rather than a new wedge, to conserve the pies' proportionality... Or just showing the 'score' in a corner of the plot, with other info at the bottom... None of this is implemented as of yet, sorry. The code commented-out shows some graphics alternatives which I did try and set aside for the moment.
Code follows:


require(ggplot2)
require(reshape)
# windows()
# Build one random string of `len` characters, drawn with replacement from
# the upper- and lower-case letters.
getRandString <- function(len = 12) {
  alphabet <- c(LETTERS, letters)
  picked <- sample(alphabet, size = len, replace = TRUE)
  paste(picked, collapse = '')
}
# Lengths (in characters) of the random compound and descriptor names.
cpd_name_len <- 8
descr_name_len <- 3
# let's create a dummy set
# the number and names of the variables
nvars <- 4
varnames <- vapply(rep(descr_name_len, times = nvars), getRandString, character(1))
# page layout: compounds per row and rows per page give the compounds per page
ncpd <- 25; cpd_x_row <- 5; rows_x_page <- 6; cpd_x_page <- cpd_x_row * rows_x_page
# the number and name of item (in my case compounds)
cpdnames <- vapply(rep(cpd_name_len, times = ncpd), getRandString, character(1))
# smaller facet labels once there are 12 or more compounds
facet_font_size <- if (length(cpdnames) < 12) 8 else 4
# a matrix filled with pseudorandom gibberish (integers 1..5)
MyMatrx <- matrix(ncol = nvars, nrow = ncpd, data = sample(5, replace = TRUE, size = ncpd * nvars))
rownames(MyMatrx) <- cpdnames; colnames(MyMatrx) <- varnames
# Reorder the matrix by sum of columns - in an attempt of plotting first 'full' pies, then emptier ones
# MyMatrx<-MyMatrx[rev(order(rowSums(MyMatrx))),] # this works, but plotting order is unaltered...
# Prepend the summary column A00: the geometric mean of each row,
# computed as exp(mean(log(row))).
MyMatrx <- cbind(A00 = exp(apply(log(MyMatrx), 1, mean)), MyMatrx)
# Rank the compounds by A00, highest first.
MyMatrx <- MyMatrx[rev(order(MyMatrx[, 'A00'])), ]
# now melt your dataframe so as to be amenable to plotting as bargraph (of which piechart are but a subset)
DF <- melt(MyMatrx, varnames = c('cpd', 'variable'))
DF$variable <- relevel(DF$variable, 'A00') # reorders the levels so that A00 is first
# flag the summary slice ('main') against the ordinary ones ('component')
DF$main <- 'component'; DF$main[DF$variable == "A00"] <- 'main'
# pin the factor levels to the ranked row order of MyMatrx
DF$cpd <- factor(DF$cpd, levels = row.names(MyMatrx))
DF <- DF[order(DF$cpd), ] # keeps all the records of a compound together, in ranked order
# # let's now print out a series of Vlaaivis, faceted according to each compound - that is, one Vlaaivis x compound.
# p1<-ggplot(DF, aes(factor(variable), value, fill = factor(variable))) + geom_bar(width = 1, alpha=0.5) + scale_y_continuous(breaks = 0:10) + coord_polar() + labs(x = "", y = "") + opts(legend.position = "none", axis.text.y = theme_blank(), axis.ticks = theme_blank()) + facet_wrap( ~ cpd, ncol = 5)
# # It works!!! (Albeit not perfectly)
# pdf(sprintf("%s.%s","C:/Users/LucaF/Documents/My Dropbox/Vlaaivis",'pdf'), onefile=T, width=8,height=12, useDingbats=F);
# print(p1);
# # savePlot("C:/Users/LucaF/Documents/My Dropbox/Vlaaivis.png",type='png')
# dev.off()
# Render the ranked pies to a multi-page PDF, cpd_x_page compounds per page.
# NOTE(review): USERPROFILE is normally only set on Windows; elsewhere
# Sys.getenv() returns "" and the path resolves under "/" - confirm the target.
pdf(sprintf("%s.%s", file.path(Sys.getenv("USERPROFILE"), "Documents/My Dropbox/Vlaaivis"), 'pdf'),
    onefile = TRUE, width = 8, height = 12, useDingbats = FALSE)
# DF holds nvars + 1 consecutive rows per compound (the components plus A00).
rows_x_cpd <- nvars + 1
# index of the first compound shown on each page
cpd_starts <- seq(1, ncpd, by = cpd_x_page)
for (pagina in seq_along(cpd_starts)) {
  # first and last DF row belonging to this page; the last page may be short
  inizio <- (cpd_starts[pagina] - 1) * rows_x_cpd + 1
  fine <- min(nrow(DF), (cpd_starts[pagina] - 1 + cpd_x_page) * rows_x_cpd)
  sliceseq <- inizio:fine
  Slice <- DF[sliceseq, ]
  print(Slice)
  # let's now print out a series of Vlaaivis, faceted according to each compound - that is, one Vlaaivis x compound.
  # version 1: one wedge, almost touching, black outline, opaque filling...
  # p1<-ggplot(Slice, aes(factor(variable), sqrt(value), fill = factor(variable))) + geom_bar(width = .95, alpha=1, col='black') + scale_y_continuous(breaks = 0:10) + coord_polar() + labs(x = "", y = "") + opts(legend.position = "top", axis.text.x = theme_blank(), axis.text.y = theme_blank(), axis.ticks = theme_blank()) + facet_wrap( ~ cpd, ncol = cpd_x_row)
  # version 2: a bit fuzzied up by different bin widths...
  # p1<-ggplot(Slice, aes(factor(variable), sqrt(value), fill = factor(variable))) + geom_bar(width = jitter(rep(.9, 5), factor=10), alpha=.3) + scale_y_continuous(breaks = 0:10) + coord_polar() + labs(x = "", y = "") + opts(legend.position = "none", axis.text.y = theme_blank(), axis.ticks = theme_blank()) + facet_wrap( ~ cpd, ncol = cpd_x_row)
  # version 3: fuzzied up both on the x and the y axes
  # p1<-ggplot(Slice)
  # for (n in 1:10) {
  # p1<-p1 + geom_bar(aes(factor(variable), sqrt(jitter(value, factor=10)), fill = factor(variable)), width = jitter(rep(.9, 10), factor=10), alpha=.01)
  # }
  # p1<-p1 + scale_y_continuous(breaks = 0:10) + coord_polar() + labs(x = "", y = "") + opts(legend.position = "top", axis.text.x = theme_blank(), axis.text.y = theme_blank(), axis.ticks = theme_blank()) + facet_wrap( ~ cpd, ncol = cpd_x_row)
  # version 4: not sure yet
  Slice_main <- Slice[Slice$main == 'main', ]
  Slice_othr <- Slice[Slice$main != 'main', ]
  # Layer order is kept: white summary wedge and its label first, then the
  # coloured component wedges. stat = "identity" makes the bar height the
  # value itself instead of a row count.
  p1 <- ggplot() +
    geom_bar(data = Slice_main, aes(factor(variable), value), stat = "identity",
             width = 1, alpha = 1, fill = 'white', col = 'black') +
    geom_text(data = Slice_main,
              aes(x = factor(variable), y = 2 * sqrt(value) / 3, label = round((value))),
              size = 3) +
    geom_bar(data = Slice_othr, aes(factor(variable), (value), fill = factor(variable)),
             stat = "identity", col = 'black', lwd = 0.1, width = 1, alpha = .5)
  # theme()/element_*() replace the defunct opts()/theme_*() calls.
  p1 <- p1 + scale_y_sqrt(limits = c(0, max(DF$value))) + labs(x = "", y = "") +
    theme(legend.position = "top", axis.text.y = element_blank(),
          axis.text.x = element_blank(), axis.ticks = element_blank()) +
    coord_polar(start = -pi / (nvars + 1))
  #p1<-p1 + facet_grid(cpd ~ main)
  p1 <- p1 + facet_wrap(~ cpd, ncol = cpd_x_row) +
    theme(strip.text.x = element_text(size = facet_font_size))
  # It works!!! (Albeit not perfectly)
  print(p1)
}
dev.off()
source: http://grrrraphics.blogspot.hk/2012/05/again-on-polarstarpie-charts.html









polar histogram: pretty and useful

source: http://chrisladroue.com/2012/02/polar-histogram-pretty-and-useful/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+ChristopheLadroueR+%28Christophe+Ladroue+%C2%BB+R%29

Do you have tens of histograms to show but no room to put them all on the page? As I was reading this paper in Nature Genetics, I came across a simple and clever way of packing all this information in a small space: arrange them all around a circle, and add some guides to help their cross-comparison.


It didn’t look too difficult to implement in ggplot2 thanks to polar coordinates and after a busy Saturday afternoon I ended up with the following image with my data (*) (and a poster-ready pdf, after 2 seconds of prettying up with Inkscape):
The graph shows the proportion of some SNP scores (‘first’, ‘second’ and ‘third’) for a number of phenotypes, which are grouped by themes. I’m quite happy with the result. It’s pretty and useful: it’s very easy to compare one histogram with any of the other 60.
The code is still a bit rough around the edges; a few things are not terribly elegant or are hard-coded. An improved version will be shipped with our graphical package next month. In the mean-time, here it is, if you want to try it with your own data.
It returns a ggplot object containing the graph. You can either display it with print(), save it as a pdf with ggsave("myPlot.pdf"), or modify it with the usual ggplot2 commands. I’ve called it polar histogram, which, I think, is self-explanatory. If you know what it’s actually called, please let me know. (No, I will not call it polR histogram.)
And here is some fake data to get you going:
# fake data for polarHistogram()
# Christophe Ladroue
library(plyr)
library(ggplot2)
source("polarHistogram.R")
 
# a little helper that generates random names for families and items.
# A little helper that generates random names for families and items.
#
# n:         how many names to generate.
# syllables: consonant+vowel pairs per name, so each name has
#            2 * syllables characters.
#
# Returns a character vector of length n. vapply() keeps the result a
# character vector even for n = 0, where replicate() would return list().
randomName <- function(n = 1, syllables = 3) {
  vowels <- c("a", "e", "i", "o", "u", "y")
  consonants <- setdiff(letters, vowels)
  one_name <- function(i) {
    # consonants are drawn before vowels, then interleaved pairwise
    cons <- sample(consonants, syllables, replace = TRUE)
    vow <- sample(vowels, syllables, replace = TRUE)
    paste0(cons, vow, collapse = "")
  }
  vapply(seq_len(n), one_name, character(1))
}
 
  # Fixed seed so the fake data set is reproducible.
  set.seed(42)

  nFamily <- 20
  # each family gets between 1 and 6 items
  nItemPerFamily <- sample(1:6, nFamily, replace = TRUE)
  nValues <- 3

  # one row per item; the family name is repeated once per item
  df <- data.frame(
    family = rep(randomName(nFamily), nItemPerFamily),
    item = randomName(sum(nItemPerFamily), 2))

  # append nValues columns of uniform random scores
  df <- cbind(df, as.data.frame(matrix(runif(nrow(df) * nValues), nrow = nrow(df), ncol = nValues)))

  # from wide to long; `variable_name` is the reshape (v1) melt() argument and
  # this script does not attach reshape, so the call is namespace-qualified.
  df <- reshape::melt(df, c("family", "item"), variable_name = "score")
  # NOTE(review): the accompanying text names this option `familyLabels`;
  # `familyLabel` would then rely on partial argument matching - confirm
  # against polarHistogram.R.
  p <- polarHistogram(df, familyLabel = FALSE)
  print(p)

Options: Many defaults can be changed already, look at the code for the complete list. The two things you might want to change are familyLabels (logical) which displays (or not) the name of each group as well, and direction, which is either ‘inwards’ or ‘outwards’.
Coding notes: It wasn’t terribly difficult but it did take me a bit longer than expected, for a few reasons:
  1. coord_polar() doesn’t affect the orientation of geom_text() so it had to be calculated manually.
  2. You’ll notice that the label orientations change between 6 and 9 o’clock, or they would end up upside down and be difficult to read.
  3. There are some scoping issues with plyr and ggplot2 which can be a bit annoying once you encapsulate your code in a function. For example:
# Minimal illustration of the scoping issue described above: the same aes()
# expression is used once at top level and once inside a function.
df<-data.frame(
  x=runif(10),
  y=runif(10))
 
# z is defined at top level here, so it is visible when the plot is built
z<-10
ggplot(df)+geom_point(aes(x=x+z,y=y)) # works
 
# remove the top-level z so that only the function-local one below remains
rm(z)
# Same plot, but z now exists only inside the function body.
fakeFunction<-function(df){
  z<-10
  ggplot(df)+geom_point(aes(x=x+z,y=y))
  }
 
# NOTE(review): reported as an error by the post because the local z is not
# found by aes(); newer ggplot2 versions may behave differently - confirm.
fakeFunction(df) # error
example code and data


R package ‘phorest‘.

Friday, April 19, 2013

Using R: Correlation heatmap with ggplot2


data(attitude)
library(ggplot2)
library(reshape2)
# cor() gives the pairwise correlation matrix; melt() turns it into long form
# with columns Var1, Var2 and value. ggplot() + geom_tile() replaces the
# deprecated qplot(..., geom = "tile") shortcut with the same mappings.
ggplot(melt(cor(attitude)), aes(x = Var1, y = Var2, fill = value)) +
  geom_tile()

So, what is going on in that short passage? cor makes a correlation matrix with all the pairwise correlations between variables (twice; plus a diagonal of ones). melt takes the matrix and creates a data frame in long form, each row consisting of id variables Var1 and Var2 and a single value. We then plot with the tile geometry, mapping the indicator variables to rows and columns, and value (i.e. correlations) to the fill colour.

from: http://www.r-bloggers.com/using-r-correlation-heatmap-with-ggplot2/