################################################################################ ## Project PuMaQC - GEO Search ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## This script performs queries GEO database for arrays (GSMs) matching defined keywords ## Input: ## ini.file - INI file name with full path ## src.path - folder with PuMaQC scripts (default - online) ## Output: ## Summary report ## Table in text format with description for found GSM ## ## (c)GNU GPL P.V.Nazarov, J.P.Corte-Real, 2011 ################################################################################ PuMaQC.Search = function(ini.file, src.path = "http://sablab.net/PuMaQC") { ############# ## Libraries source("http://sablab.net/scripts/parseINI.r") source("http://sablab.net/scripts/drawTable.r") require(GEOquery) require(GEOmetadb) ## set default folder to the same folder as INI, if we are not there already if (length(grep("/",ini.file))>0){ setwd(sub("/[^/]+$","",ini.file)) }else{ ini.file = file.path(getwd(),ini.file) } Info = parseINI(ini.file,correct.bslash=T) ##-------------------------------------------------- ## makeGEOQuery: function to make SQL query ## key.inc - character vectors with include keywords (MUST be given) ## key.exc - character vectors with exclude keywords (can be skipped) ## organism - organism or "" ## gpl - character vector of GPLs makeGEOQuery = function(GEOmetadb, key.inc,key.exc="",organism="",gpl="") { fields=c("title", "source_name_ch1", "characteristics_ch1", "treatment_protocol_ch1", "description") query = "select * from gsm where" if (gpl[1]!="") { query = paste(query,"(",paste("gpl='",gpl,"'",sep="",collapse=" or "),") and") } # JCR >> #fields=c("gsm.title", # "gse.title", # "gsm.source_name_ch1", # "gsm.characteristics_ch1", # "gsm.treatment_protocol_ch1", # "gsm.description") # query = "select gse.title, gsm.* from gsm JOIN gpl ON gsm.gpl=gpl.gpl JOIN gse ON gsm.series_id=gse.gse where" ## changed # << JCR if (organism[1]!="") { query = paste(query,"(",paste("organism_ch1 like '%",organism,"%'",sep="",collapse=" or "),") and") } qus.inc = character(length(key.inc)) for (i in 1:length(key.inc)){ qus.inc[i] = paste(sprintf("%s like '%%%s%%'",fields,key.inc[i]),collapse=" or ") } if (key.exc[1]!=""){ qus.exc = character(length(key.exc)) for (i in 1:length(key.exc)){ qus.exc[i] = paste(sprintf("%s not like '%%%s%%'",fields,key.exc[i]),collapse=" and ") } } if (key.exc[1]!=""){ query = paste(query, "(", paste(qus.inc,sep="",collapse=" or "),") and", "(", paste(qus.exc,sep="",collapse=" and "),") and", "supplementary_file like '%CEL%'") ## JCR: +gsm. }else{ query = paste(query, "(", paste(qus.inc,sep="",collapse=" or "),") and", "supplementary_file like '%CEL%'") ## JCR: +gsm. } cat("Query:\n") cat(query,"\n") cat("Connecting to the database...\n") if (!file.exists(GEOmetadb)) stop(paste("Cannot find GEOmetadb at specified location:",Info$Search$GEOmetadb)) flush.console() con=dbConnect(SQLite(),GEOmetadb) geo_tables = dbListTables(con) selection = dbGetQuery(con,query) dbDisconnect(con) selection = selection[,-grep("_ch2$",names(selection))] return(selection) }## makeGEOQuery: END ################################################################################ ## Search GEOmetadb.sqlite ################################################################################ platforms = read.table(file.path(src.path,"platforms.txt"), header=T,sep="\t",quote="\"",comment.char="",as.is=T) organism = gsub("^ +","",strsplit(Info$Search$Organism,split=",")[[1]]) if (length(grep("^GPL",Info$Search$Platform[1]))>0) { gpl = gsub("^ +","",strsplit(Info$Search$Platform,split=",")[[1]]) } else { gpl=platforms$GPL[platforms$Platform == Info$Search$Platform] } if (length(gpl)==0) gpl="" if (length(organism)==0) organism="" key.inc = list() key.exc = list() ## one search or many ? if ("Include" %in% names(Info$Search)){ ## One cat("Single search:\n") search.names="s" nsearch = 1 key.inc$s = gsub("^ +","",strsplit(Info$Search$Include,split=",")[[1]]) if ("Exclude" %in% names(Info$Search)){ key.exc$s = gsub("^ +","",strsplit(Info$Search$Exclude,split=",")[[1]]) if (length(key.exc$s)==0) key.exc$s = "" }else{ key.exc$s = "" } cat(" Include:", key.inc$s,sep=",","\n") cat(" Exclude:", key.exc$s,sep=",","\n") } else{ ## Multi if ("Expression" %in% names(Info$Search)){ ## many search cat("Multiple search:\n") search.names=sub("Include","",names(Info$Search)[grep("Include",names(Info$Search))]) nsearch = length(search.names) if (!("Expression" %in% names(Info$Search))) stop("ERROR! Unable to find 'Expression' field and process multiple search") ## extract keywords for each search key.inc = list() key.exc = list() for (i in 1:nsearch){ key.inc[[i]] = gsub("^ +","",strsplit(eval(parse(text=sprintf("Info$Search$Include%s",search.names[i]))),split=",")[[1]]) if (sprintf("Exclude%s",search.names[i]) %in% names(Info$Search)){ key.exc[[i]] = gsub("^ +","",strsplit(eval(parse(text=sprintf("Info$Search$Exclude%s",search.names[i]))),split=",")[[1]]) if (length(key.exc[[i]])==0) key.exc[[i]] = "" }else key.exc[[i]] = "" } ## modify 'expr' and make safe naming of the searches: s expr = Info$Search$Expression expr = gsub("[(]"," ( ",expr) expr = gsub("[)]"," ) ",expr) expr = gsub(" and | And | AND "," & ",expr) expr = gsub(" or | Or | OR "," | ",expr) expr = gsub(" not | Not | NOT "," ! ",expr) tmp=expr for (i in 1:nsearch){ cat("Search:", search.names[i],"\n") expr = gsub(search.names[i],paste("s",search.names[i],sep=""),expr) tmp = gsub(search.names[i],"",tmp) search.names[i] = paste("s",search.names[i],sep="") cat(" Include:", key.inc[[i]],sep=",","\n") cat(" Exclude:", key.exc[[i]],sep=",","\n") } if (""!=gsub(" |[(]|[)]|[&]|[|]|[!]","",tmp)) stop("ERROR: Mistake in Expression field - check names of the search results") } else{ ## exploratory search cat("Exploratory search. No keywords.\n") nsearch = 1 } } #rm(list=c("selection","selection.lst","all.gsm")) selection.lst=list() for (isearch in 1:nsearch) { cat("Search ", search.names[isearch],". ") selection.lst[[isearch]] = makeGEOQuery(GEOmetadb=Info$Search$GEOmetadb, key.inc=key.inc[[isearch]],key.exc=key.exc[[isearch]],organism=organism,gpl=gpl) flush.console() if (isearch==1){ selection = selection.lst[[1]] }else{ selection = rbind(selection,selection.lst[[isearch]]) } } ## make combination using expression if (nsearch>1){ all.gsm=unique(selection$gsm) expr.resTable=expr resTable = data.frame(all.gsm,stringsAsFactors=F) names(resTable)="gsm" for (isearch in 1:nsearch) { resTable = data.frame(resTable, F,stringsAsFactors=F) names(resTable)[ncol(resTable)]=search.names[isearch] resTable[,isearch+1] = resTable$gsm %in% selection.lst[[isearch]]$gsm expr.resTable=gsub(names(resTable)[isearch+1], paste("resTable$",search.names[isearch],sep=""), expr.resTable) } idx = eval(parse(text=expr.resTable)) idx.selection = integer(sum(idx)) for (i in 1:sum(idx)) { idx.selection[i] = which(selection$gsm == resTable$gsm[idx][i])[1] } selection = selection[idx.selection,] } str(selection) if (nrow(selection)==0) { cat("\nNo results have been found.\n") stop("The script is stopped") } write.table(selection, file=ifelse(length(Info$Search$Results)>0,Info$Search$Results,"searchResults.txt"), sep="\t",row.names=F, col.names=T,quote=T) ################################################################################ ## Report ################################################################################ pdf("PuMaQC-1_Search.pdf",width=8.3, height=11.7,onefile=TRUE) palL = colorRampPalette(c("lightpink","palegoldenrod","palegreen","paleturquoise","lightskyblue","mediumpurple1")) palD = colorRampPalette(c("darkred","gold4","darkgreen","turquoise4","darkblue","purple4")) plot.new() title("PuMaQC Report 1. GEO Search Summary") Tmp = as.data.frame(matrix(nrow=5+length(grep),ncol=1), check.names=F,stringsAsFactors =F) Tmp = data.frame(ini.file,stringsAsFactors =F) rownames(Tmp)[nrow(Tmp)] = "INI file:" Tmp = rbind(Tmp, Info$Project$Title) rownames(Tmp)[nrow(Tmp)] = "Project:" Tmp = rbind(Tmp, Info$Project$Description) rownames(Tmp)[nrow(Tmp)] = "Description:" Tmp = rbind(Tmp, Info$Search$Organism) rownames(Tmp)[nrow(Tmp)] = "Organism:" Tmp = rbind(Tmp, Info$Search$Platform) rownames(Tmp)[nrow(Tmp)] = "Platform:" Tmp = rbind(Tmp, "") rownames(Tmp)[nrow(Tmp)] = " " if (nsearch == 1){ Tmp = rbind(Tmp, "Single search") }else{ Tmp = rbind(Tmp, paste("Multiple search. Results = ",Info$Search$Expression))} rownames(Tmp)[nrow(Tmp)] = "Search:" for (i in grep("Include|Exclude",names(Info$Search))){ Tmp = rbind(Tmp, Info$Search[[i]]) rownames(Tmp)[nrow(Tmp)] = paste(" .",names(Info$Search)[i],sep="") } Tmp = rbind(Tmp, "") rownames(Tmp)[nrow(Tmp)] = " " Tmp = rbind(Tmp, as.character(nrow(selection))) rownames(Tmp)[nrow(Tmp)] = "Res. # of samples (GSM):" Tmp = rbind(Tmp, nlevels(as.factor(selection$series_id))) rownames(Tmp)[nrow(Tmp)] = "Res. # of series (GSE):" Tmp = rbind(Tmp, nlevels(as.factor(selection$gpl))) rownames(Tmp)[nrow(Tmp)] = "Res. # of platforms (GPL):" names(Tmp)=" " drawTable(Tmp,y0 = 1.02, dx=0.28, dy=0.02,cex=0.8) par(fig=c(0.1,0.5,0.1,0.5),new=T) barplot(summary(as.factor(selection$gpl)),las=2,col=palL(nlevels(as.factor(selection$gpl))), main = "From Platforms...", ylab= "Number of GSMs", cex.main=1) par(fig=c(0.5,1,0.1,0.5),new=T) barplot(summary(as.factor(selection$series_id)),las=2,col=palL(nlevels(as.factor(selection$series_id))), main = "From Series...", ylab= "Number of GSMs", cex.main=1) dev.off() return(nrow(selection)) }