Publication Statistics (Progenetix data)

Modification by Michael: uses RCurl and read.csv (instead of read.table from a URL) for compatibility with RMarkdown. Modified version by Qingyao: manual jittering on the x-axis by PMID within a year, and on the y-axis by a random number generator with a fixed seed.

library(ggplot2)
library(reshape2)
library(scales)
library(RCurl)

# Download the publication table from the Progenetix export endpoint as
# tab-separated text. `followlocation` lets curl follow HTTP redirects.
data <- getURL("http://progenetix.org/qmongo/?db=progenetix&collection=publications_web&all=multvalues&afqfield=STATUS&querytext=.&exportk_m=PMID,YEAR,NO_CCGH,NO_ACGH,NO_WES,NO_WGS", followlocation = 1L)

# Parse the payload; the literal string "NA" marks a missing value.
# `na.strings` is spelled out instead of relying on partial matching of `na`.
pubs <- read.csv(text = data, header = TRUE, sep = "\t", na.strings = "NA")

# Fail early with a readable message if the export did not contain the
# columns the rest of the script relies on (e.g. an error page was returned).
required_cols <- c("PMID", "YEAR", "NO_CCGH", "NO_ACGH", "NO_WES", "NO_WGS")
missing_cols <- setdiff(required_cols, names(pubs))
if (length(missing_cols) > 0) {
  stop(
    "Progenetix export lacks expected column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# Keep only publications with a known year. Missing years are real NA values
# after parsing, so test them with is.na() rather than comparing to "NA".
pubs <- pubs[!is.na(pubs$YEAR), , drop = FALSE]

# Directory that receives the generated PDF plots.
plotpath <- file.path(Sys.getenv("HOME"), "Documents", "Rplots")

# recursive = TRUE also creates a missing parent directory; without it the
# call fails silently (warnings are suppressed) and pdf() errors later on.
dir.create(plotpath, showWarnings = FALSE, recursive = TRUE)

# Number of publications with a known year. Rows without a year were already
# removed when `pubs` was loaded, so no second filtering pass is needed.
nrow(pubs)

# Reshape to long format: one row per (publication, technique) pair.
allpubs <- melt(
  pubs,
  id.vars = c("YEAR", "PMID"),
  measure.vars = c("NO_CCGH", "NO_ACGH", "NO_WES", "NO_WGS")
)

# Keep only pairs that actually report samples (drops zero and NA counts).
allpubs <- allpubs[which(allpubs$value > 0), , drop = FALSE]
colnames(allpubs) <- c("Year", "PMID", "Technique", "Samplenumber")

# Convert PMID to a number *before* sorting so that the order within a year
# is numeric even if the column was not parsed as a number.
allpubs$PMID <- as.numeric(as.character(allpubs$PMID))
allpubs <- allpubs[order(allpubs$Year, allpubs$PMID), ]

# Manual x-axis jitter: the n points of a year are spread evenly over
# (year, year + 1] in PMID order. ave() assigns the offsets per year group,
# so the result does not depend on positional alignment and empty input
# simply yields an empty vector.
allpubs$Year <- ave(
  as.numeric(as.character(allpubs$Year)),
  allpubs$Year,
  FUN = function(y) y + seq(0, 1, length.out = length(y) + 1L)[-1L]
)

# PMID was only needed for ordering.
allpubs$PMID <- NULL

# Strip the "NO_" column prefix to obtain the technique label.
allpubs$Technique <- sub("^NO_", "", allpubs$Technique)

# The title must be built before the y-jitter so it reports true sample sums.
title <- paste0(
  "Cancer Samples per Publication for Different Techniques\n[",
  sum(allpubs$Samplenumber), " samples from ", nrow(pubs), " publications]\n"
)

# Reproducible y-axis jitter: add a uniform offset in [0, 1) to every count.
set.seed(1)
allpubs$Samplenumber <- runif(nrow(allpubs), 0, 1) + allpubs$Samplenumber

# Legend-only copy of the data: every y value is NaN, so no point is drawn
# (na.rm = TRUE drops them silently) but the layer still contributes a fully
# opaque colour legend. rep() keeps the assignment valid for zero rows.
allpubsdummy <- allpubs
allpubsdummy$Samplenumber <- rep(NaN, nrow(allpubsdummy))

# Build the samples-per-publication scatter plot.
#   points        data frame with Year, Samplenumber and Technique columns
#   legend_points same columns; used only to generate the colour legend
#   plot_title    title string
#   log2_y        if TRUE, use a log2-transformed y axis
build_samplenumber_plot <- function(points, legend_points, plot_title,
                                    log2_y = FALSE) {
  p <- ggplot() +
    geom_point(
      data = points,
      aes(x = Year, y = Samplenumber, colour = Technique, size = Samplenumber),
      position = position_jitter(width = 0, height = 0),
      alpha = 0.4,
      show.legend = FALSE
    ) +
    geom_point(
      data = legend_points,
      aes(x = Year, y = Samplenumber, colour = Technique),
      alpha = 1,
      na.rm = TRUE
    ) +
    labs(
      title = plot_title,
      x = "Year of Publication",
      y = "Number of Cancer Samples in Publication"
    )
  if (log2_y) {
    p <- p + scale_y_continuous(trans = log2_trans())
  }
  p
}

# Render `plot` into a PDF file. print() is explicit because ggplot objects
# are not auto-printed when the script is run through source(); on.exit()
# closes the device even if rendering fails.
write_plot_pdf <- function(plot, file, width = 8, height = 5) {
  pdf(file = file, width = width, height = height)
  on.exit(dev.off(), add = TRUE)
  print(plot)
  invisible(file)
}

# Log2-scaled version.
write_plot_pdf(
  build_samplenumber_plot(allpubs, allpubsdummy, title, log2_y = TRUE),
  file.path(plotpath, "samplenumbersPubs.pdf")
)

# Linear-scaled version.
write_plot_pdf(
  build_samplenumber_plot(allpubs, allpubsdummy, title, log2_y = FALSE),
  file.path(plotpath, "samplenumbersPubsLinear.pdf")
)
Topic revision: r9 - 17 Jan 2018, MichaelBaudis
This site is powered by Foswiki. Copyright © by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
Ideas, requests, problems regarding Progenetixwiki? Send feedback