开发者

How to create an "inkblot" chart with R?

How can I create a chart like

http://junkcharts.typepad.com/junk_charts/2010/01/leaving-ink-traces.html

where several time series (one per country) are displayed horizontally as symmetric areas?

I think if I could display one time series in this way, it is easy to generalize to several using mfrow.

Sample data:

#Solar energy production in Europe, by country (EC),(1 000 toe)  
Country,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007  
Belgium,1,1,1,1,1,1,2,2,3,3,3,5  
Bulgaria,-,-,-,-,-,-,-,-,-,-,-,-  
Czech Republic,0,0,0,0,0,0,0,0,2,2,3,4  
Denmark,6,7,7,8,8,8,9,9,9,10,10,11  
Germany (including ex-GDR from 1991),57,70,83,78,96,150,184,216,262,353,472,580  
Estonia,-,-,-,-,-,-,-,-,-,-,-,-  
Ireland,0,0,0,0,0,0,0,0,0,0,1,1  
Greece,86,89,93,97,99,100,99,99,101,102,109,160  
Spain,26,23,26,29,33,38,43,48,58,65,83,137  
France,15,开发者_如何学运维16,17,18,26,19,19,18,19,22,29,37  
Italy,8,9,11,11,12,14,16,18,21,30,38,56  
Cyprus,32,33,34,35,35,34,35,36,40,41,43,54  
Latvia,-,-,-,-,-,-,-,-,-,-,-,-  
Lithuania,-,-,-,-,-,-,-,-,-,-,-,-  
Luxembourg (Grand-Duché),0,0,0,0,0,0,0,0,1,2,2,2  
Hungary,0,0,0,0,0,1,2,2,2,2,2,3  
Netherlands,6,7,8,10,12,14,16,19,20,22,22,23  
Austria,42,48,55,58,64,69,74,80,86,92,101,108  
Poland,0,0,0,0,0,0,0,0,0,0,0,0  
Portugal,16,16,17,18,18,19,20,21,21,23,24,28  
Romania,0,0,0,0,0,0,0,0,0,0,0,0  
Slovenia,-,-,-,-,-,-,-,-,-,-,-,-  
Slovakia,0,0,0,0,0,0,0,0,0,0,0,0  
Finland,0,0,0,0,1,1,1,1,1,1,1,1  
Sweden,4,4,5,5,5,6,4,5,5,6,6,9  
United Kingdom,6,6,7,7,11,13,16,20,25,30,37,46  
Croatia,0,0,0,0,0,0,0,0,0,0,0,1  
Turkey,159,179,210,236,262,287,318,350,375,385,402,420  
Iceland,-,-,-,-,-,-,-,-,-,-,-,-  
Norway,0,0,0,0,0,0,0,0,0,0,0,0  
Switzerland,18,19,21,23,24,26,23,24,25,26,28,30  
#-='Not applicable' or 'Real zero' or 'Zero by default' :=Not available "
#Source of Data:,Eurostat, http://spreadsheets.google.com/ccc?key=0Agol553XfuDZdFpCQU1CUVdPZ3M0djJBSE1za1NGV0E&hl=en_GB  
#Last Update:,30.04.2009  
#Date of extraction:,17 Aug 2009 07:41:12 GMT, http://epp.eurostat.ec.europa.eu/tgm/table.do?tab=table&init=1&plugin=1&language=en&pcode=ten00082


You can use polygon in base graphics, for instance

x <- seq(as.POSIXct("1949-01-01", tz="GMT"), length=36, by="months")
y <- rnorm(length(x))
plot(x, y, type="n", ylim=c(-1,1)*max(abs(y)))
polygon(c(x, rev(x)), c(y, -rev(y)), col="cornflowerblue", border=NA)

Update: Using panel.polygon from lattice:

library("lattice")
library("RColorBrewer")

df <- data.frame(x=rep(x,3),
                 y=rnorm(3*length(x)),
                 variable=gl(3, length(x)))

p <- xyplot(y~x|variable, data=df,
            ylim=c(-1,1)*max(abs(y)),
            layout=c(1,3),
            fill=brewer.pal(3, "Pastel2"),
            panel=function(...) {
              args <- list(...)
              print(args)
              panel.polygon(c(args$x, rev(args$x)),
                            c(args$y, -rev(args$y)),
                            fill=args$fill[panel.number()],
                            border=NA)
            })
print(p)


With a little polishing, this ggplot solution will look like what you want:

alt text http://www.imagechicken.com/uploads/1264790429056858700.png

Here's how to make it from your data:

require(ggplot2)

First, let's take your input data and import and restructure it into a form ggplot likes:

rdata = read.csv("data.csv", 
# options: load '-' as na, ignore first comment line #Solar,
# strip whitespace that ends line, accept numbers as col headings
  na.strings="-", skip=1, strip.white=T, check.names=F)
# Convert to long format and check years are numeric
data = melt(rdata)
data = transform(data,year=as.numeric(as.character(variable)))
# geom_ribbon hates NAs.
data = data[!is.na(data$value),]

> summary(data)
           Country       variable       value             year     
 Austria       : 12   1996   : 25   Min.   :  0.00   Min.   :1996  
 Belgium       : 12   1997   : 25   1st Qu.:  0.00   1st Qu.:1999  
 Croatia       : 12   1998   : 25   Median :  7.00   Median :2002  
 Cyprus        : 12   1999   : 25   Mean   : 36.73   Mean   :2002  
 Czech Republic: 12   2000   : 25   3rd Qu.: 30.00   3rd Qu.:2004  
 Denmark       : 12   2001   : 25   Max.   :580.00   Max.   :2007  
 (Other)       :228   (Other):150

Now let's plot it:

ggplot(data=data, aes(fill=Country)) +
  facet_grid(Country~.,space="free", scales="free_y") +
  opts(legend.position="none") +
  geom_ribbon(aes(x=year,ymin=-value,ymax=+value))


Using rcs' first approach, here a solution for the sample data with base graphics:

rawData <- read.csv("solar.csv", na.strings="-")
data <- ts(t(as.matrix(rawData[,2:13])), names=rawData[,1], start=1996)

inkblot <- function(series, col=NULL, min.height=40, col.value=24, col.category=17, ...) {  
  # assumes non-negative values  
  # assumes that series is multivariate series  
  # assumes that series names are set, i.e. colnames(series) != NULL  

  x <- as.vector(time(series))  
  if(length(col)==0){  
    col <- rainbow(dim(series)[2])  
  }  

  ytotal <- 0  
  for(category in colnames(series)) {  
    y <- series[, category]
    y <- y[!is.na(y)]
    ytotal <- ytotal + max(y, min.height)  
  }  

  oldpar = par(no.readonly = TRUE)   
  par(mar=c(2,3,0,10)+0.1, cex=0.7)  

  plot(x, 1:length(x), type="n", ylim=c(0,1)*ytotal, yaxt="n", xaxt="n", bty="n", ylab="", xlab="", ...) 
  axis(side=1, at=x)  

  catNumber <- 1  
  offset <- 0  
  for(category in rev(colnames(series))) {  
    print(paste("working on: ", category))
    y <- 0.5 * as.vector(series[,category])  
    offset <- offset + max(max(abs(y[!is.na(y)])), 0.5*min.height)  
    print(paste("offset= ", str(offset)))
    polygon(c(x, rev(x)), c(offset+y, offset-rev(y)), col=col[catNumber], border=NA)  
    mtext(text=y[1], side=2, at=offset, las=2, cex=0.7, col=col.value)  
    mtext(text=y[length(y)], side=4, line=-1, at=offset, las=2, cex=0.7, col=col.value)  
    mtext(text=category, side=4, line=2, at=offset, las=2, cex=0.7, col=col.category)  
    offset <- offset + max(max(abs(y[!is.na(y)])), 0.5*min.height)  
    catNumber <- catNumber + 1   
  }  
}  


inkblot(data)  

I still need to figure out the vertical grid lines and the transparent coloring.

How to create an "inkblot" chart with R?


Late to this game, but I created a stacked "blot" chart using ggplot2 and another set of data. This uses geom_polygon after the data has been smoothed out.

# data: Masaaki Ishida (luna@pos.to)
# http://luna.pos.to/whale/sta.html

head(blue, 2)
##      Season Norway U.K. Japan Panama Denmark Germany U.S.A. Netherlands
## ## [1,]   1931      0 6050     0      0       0       0      0           0
## ## [2,]   1932  10128 8496     0      0       0       0      0           0
## ##      U.S.S.R. South.Africa TOTAL
## ## [1,]        0            0  6050
## ## [2,]        0            0 18624

hourglass.plot <- function(df) {
  stack.df <- df[,-1]
  stack.df <- stack.df[,sort(colnames(stack.df))]
  stack.df <- apply(stack.df, 1, cumsum)
  stack.df <- apply(stack.df, 1, function(x) sapply(x, cumsum))
  stack.df <- t(apply(stack.df, 1, function(x) x - mean(x)))
  # use this for actual data
  ##  coords.df <- data.frame(x = rep(c(df[,1], rev(df[,1])), times = dim(stack.df)[2]), y = c(apply(stack.df, 1, min), as.numeric(apply(stack.df, 2, function(x) c(rev(x),x)))[1:(length(df[,1])*length(colnames(stack.df))*2-length(df[,1]))]), id = rep(colnames(stack.df), each = 2*length(df[,1])))

  ##  qplot(x = x, y = y, data = coords.df, geom = "polygon", color = I("white"), fill = id)

  # use this for smoothed data
  density.df <- apply(stack.df, 2, function(x) spline(x = df[,1], y = x))
  id.df <- sort(rep(colnames(stack.df), each = as.numeric(lapply(density.df, function(x) length(x$x)))))
  density.df <- do.call("rbind", lapply(density.df, as.data.frame))
  density.df <- data.frame(density.df, id = id.df)
  smooth.df <- data.frame(x = unlist(tapply(density.df$x, density.df$id, function(x) c(x, rev(x)))), y = c(apply(unstack(density.df[,2:3]), 1, min), unlist(tapply(density.df$y, density.df$id, function(x) c(rev(x), x)))[1:(table(density.df$id)[1]+2*max(cumsum(table(density.df$id))[-dim(stack.df)[2]]))]), id = rep(names(table(density.df$id)), each = 2*table(density.df$id)))

  qplot(x = x, y = y, data = smooth.df, geom = "polygon", color = I("white"), fill = id)
}

hourglass.plot(blue[,-12]) + opts(title = c("Blue Whale Catch"))

alt text http://probabilitynotes.files.wordpress.com/2010/06/bluewhalecatch.png

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜