How to create an "inkblot" chart with R?
How can I create a chart like
http://junkcharts.typepad.com/junk_charts/2010/01/leaving-ink-traces.html
where several time series (one per country) are displayed horizontally as symmetric areas?
I think if I could display one time series in this way, it is easy to generalize to several using mfrow.
Sample data:
#Solar energy production in Europe, by country (EC),(1 000 toe)
Country,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007
Belgium,1,1,1,1,1,1,2,2,3,3,3,5
Bulgaria,-,-,-,-,-,-,-,-,-,-,-,-
Czech Republic,0,0,0,0,0,0,0,0,2,2,3,4
Denmark,6,7,7,8,8,8,9,9,9,10,10,11
Germany (including ex-GDR from 1991),57,70,83,78,96,150,184,216,262,353,472,580
Estonia,-,-,-,-,-,-,-,-,-,-,-,-
Ireland,0,0,0,0,0,0,0,0,0,0,1,1
Greece,86,89,93,97,99,100,99,99,101,102,109,160
Spain,26,23,26,29,33,38,43,48,58,65,83,137
France,15,开发者_如何学运维16,17,18,26,19,19,18,19,22,29,37
Italy,8,9,11,11,12,14,16,18,21,30,38,56
Cyprus,32,33,34,35,35,34,35,36,40,41,43,54
Latvia,-,-,-,-,-,-,-,-,-,-,-,-
Lithuania,-,-,-,-,-,-,-,-,-,-,-,-
Luxembourg (Grand-Duché),0,0,0,0,0,0,0,0,1,2,2,2
Hungary,0,0,0,0,0,1,2,2,2,2,2,3
Netherlands,6,7,8,10,12,14,16,19,20,22,22,23
Austria,42,48,55,58,64,69,74,80,86,92,101,108
Poland,0,0,0,0,0,0,0,0,0,0,0,0
Portugal,16,16,17,18,18,19,20,21,21,23,24,28
Romania,0,0,0,0,0,0,0,0,0,0,0,0
Slovenia,-,-,-,-,-,-,-,-,-,-,-,-
Slovakia,0,0,0,0,0,0,0,0,0,0,0,0
Finland,0,0,0,0,1,1,1,1,1,1,1,1
Sweden,4,4,5,5,5,6,4,5,5,6,6,9
United Kingdom,6,6,7,7,11,13,16,20,25,30,37,46
Croatia,0,0,0,0,0,0,0,0,0,0,0,1
Turkey,159,179,210,236,262,287,318,350,375,385,402,420
Iceland,-,-,-,-,-,-,-,-,-,-,-,-
Norway,0,0,0,0,0,0,0,0,0,0,0,0
Switzerland,18,19,21,23,24,26,23,24,25,26,28,30
#-='Not applicable' or 'Real zero' or 'Zero by default' :=Not available "
#Source of Data:,Eurostat, http://spreadsheets.google.com/ccc?key=0Agol553XfuDZdFpCQU1CUVdPZ3M0djJBSE1za1NGV0E&hl=en_GB
#Last Update:,30.04.2009
#Date of extraction:,17 Aug 2009 07:41:12 GMT, http://epp.eurostat.ec.europa.eu/tgm/table.do?tab=table&init=1&plugin=1&language=en&pcode=ten00082
You can use polygon
in base graphics, for instance
x <- seq(as.POSIXct("1949-01-01", tz="GMT"), length=36, by="months")
y <- rnorm(length(x))
plot(x, y, type="n", ylim=c(-1,1)*max(abs(y)))
polygon(c(x, rev(x)), c(y, -rev(y)), col="cornflowerblue", border=NA)
Update: Using panel.polygon
from lattice
:
library("lattice")
library("RColorBrewer")
df <- data.frame(x=rep(x,3),
y=rnorm(3*length(x)),
variable=gl(3, length(x)))
p <- xyplot(y~x|variable, data=df,
ylim=c(-1,1)*max(abs(y)),
layout=c(1,3),
fill=brewer.pal(3, "Pastel2"),
panel=function(...) {
args <- list(...)
print(args)
panel.polygon(c(args$x, rev(args$x)),
c(args$y, -rev(args$y)),
fill=args$fill[panel.number()],
border=NA)
})
print(p)
With a little polishing, this ggplot
solution will look like what you want:
alt text http://www.imagechicken.com/uploads/1264790429056858700.png
Here's how to make it from your data:
require(ggplot2)
First, let's take your input data and import and restructure it into a form ggplot likes:
rdata = read.csv("data.csv",
# options: load '-' as na, ignore first comment line #Solar,
# strip whitespace that ends line, accept numbers as col headings
na.strings="-", skip=1, strip.white=T, check.names=F)
# Convert to long format and check years are numeric
data = melt(rdata)
data = transform(data,year=as.numeric(as.character(variable)))
# geom_ribbon hates NAs.
data = data[!is.na(data$value),]
> summary(data)
Country variable value year
Austria : 12 1996 : 25 Min. : 0.00 Min. :1996
Belgium : 12 1997 : 25 1st Qu.: 0.00 1st Qu.:1999
Croatia : 12 1998 : 25 Median : 7.00 Median :2002
Cyprus : 12 1999 : 25 Mean : 36.73 Mean :2002
Czech Republic: 12 2000 : 25 3rd Qu.: 30.00 3rd Qu.:2004
Denmark : 12 2001 : 25 Max. :580.00 Max. :2007
(Other) :228 (Other):150
Now let's plot it:
ggplot(data=data, aes(fill=Country)) +
facet_grid(Country~.,space="free", scales="free_y") +
opts(legend.position="none") +
geom_ribbon(aes(x=year,ymin=-value,ymax=+value))
Using rcs' first approach, here a solution for the sample data with base graphics:
rawData <- read.csv("solar.csv", na.strings="-")
data <- ts(t(as.matrix(rawData[,2:13])), names=rawData[,1], start=1996)
inkblot <- function(series, col=NULL, min.height=40, col.value=24, col.category=17, ...) {
# assumes non-negative values
# assumes that series is multivariate series
# assumes that series names are set, i.e. colnames(series) != NULL
x <- as.vector(time(series))
if(length(col)==0){
col <- rainbow(dim(series)[2])
}
ytotal <- 0
for(category in colnames(series)) {
y <- series[, category]
y <- y[!is.na(y)]
ytotal <- ytotal + max(y, min.height)
}
oldpar = par(no.readonly = TRUE)
par(mar=c(2,3,0,10)+0.1, cex=0.7)
plot(x, 1:length(x), type="n", ylim=c(0,1)*ytotal, yaxt="n", xaxt="n", bty="n", ylab="", xlab="", ...)
axis(side=1, at=x)
catNumber <- 1
offset <- 0
for(category in rev(colnames(series))) {
print(paste("working on: ", category))
y <- 0.5 * as.vector(series[,category])
offset <- offset + max(max(abs(y[!is.na(y)])), 0.5*min.height)
print(paste("offset= ", str(offset)))
polygon(c(x, rev(x)), c(offset+y, offset-rev(y)), col=col[catNumber], border=NA)
mtext(text=y[1], side=2, at=offset, las=2, cex=0.7, col=col.value)
mtext(text=y[length(y)], side=4, line=-1, at=offset, las=2, cex=0.7, col=col.value)
mtext(text=category, side=4, line=2, at=offset, las=2, cex=0.7, col=col.category)
offset <- offset + max(max(abs(y[!is.na(y)])), 0.5*min.height)
catNumber <- catNumber + 1
}
}
inkblot(data)
I still need to figure out the vertical grid lines and the transparent coloring.
Late to this game, but I created a stacked "blot" chart using ggplot2 and another set of data. This uses geom_polygon after the data has been smoothed out.
# data: Masaaki Ishida (luna@pos.to)
# http://luna.pos.to/whale/sta.html
head(blue, 2)
## Season Norway U.K. Japan Panama Denmark Germany U.S.A. Netherlands
## ## [1,] 1931 0 6050 0 0 0 0 0 0
## ## [2,] 1932 10128 8496 0 0 0 0 0 0
## ## U.S.S.R. South.Africa TOTAL
## ## [1,] 0 0 6050
## ## [2,] 0 0 18624
hourglass.plot <- function(df) {
stack.df <- df[,-1]
stack.df <- stack.df[,sort(colnames(stack.df))]
stack.df <- apply(stack.df, 1, cumsum)
stack.df <- apply(stack.df, 1, function(x) sapply(x, cumsum))
stack.df <- t(apply(stack.df, 1, function(x) x - mean(x)))
# use this for actual data
## coords.df <- data.frame(x = rep(c(df[,1], rev(df[,1])), times = dim(stack.df)[2]), y = c(apply(stack.df, 1, min), as.numeric(apply(stack.df, 2, function(x) c(rev(x),x)))[1:(length(df[,1])*length(colnames(stack.df))*2-length(df[,1]))]), id = rep(colnames(stack.df), each = 2*length(df[,1])))
## qplot(x = x, y = y, data = coords.df, geom = "polygon", color = I("white"), fill = id)
# use this for smoothed data
density.df <- apply(stack.df, 2, function(x) spline(x = df[,1], y = x))
id.df <- sort(rep(colnames(stack.df), each = as.numeric(lapply(density.df, function(x) length(x$x)))))
density.df <- do.call("rbind", lapply(density.df, as.data.frame))
density.df <- data.frame(density.df, id = id.df)
smooth.df <- data.frame(x = unlist(tapply(density.df$x, density.df$id, function(x) c(x, rev(x)))), y = c(apply(unstack(density.df[,2:3]), 1, min), unlist(tapply(density.df$y, density.df$id, function(x) c(rev(x), x)))[1:(table(density.df$id)[1]+2*max(cumsum(table(density.df$id))[-dim(stack.df)[2]]))]), id = rep(names(table(density.df$id)), each = 2*table(density.df$id)))
qplot(x = x, y = y, data = smooth.df, geom = "polygon", color = I("white"), fill = id)
}
hourglass.plot(blue[,-12]) + opts(title = c("Blue Whale Catch"))
alt text http://probabilitynotes.files.wordpress.com/2010/06/bluewhalecatch.png
精彩评论