############################################################### # # # An example of how to overlay microarray data on KEGG # # Pathways # # # # Michael Watson (michael.watson@bbsrc.ac.uk) # # Head of Bioinformatics # # Institute for Animal Health # # # # http://www.iah.ac.uk/research/bioinformatics/bioinf.shtml # # # ############################################################### ############################################################### # # # Section 1: # # # # This section is simply there for convenience - the only # # purpose here is to get some array data linked to ensembl # # identifiers, from GEO and Biomart, using GEOquery # # and biomaRt respectively. Skip this section if you # # already have something similar. # # # # The example here are for data from affy chicken # # # ############################################################### # get some data from GEO library(GEOquery) geo <- getGEO("GDS2646") tbl <- Table(geo) # convert numerical columns to numbers! for (i in 3:8) { tbl[,i] <- as.numeric(tbl[,i]) } # get link from platform to ensembl id from biomart library(biomaRt) mart <- useMart(biomart="ensembl", dataset="ggallus_gene_ensembl") affy2ens <- getBM(attributes=c("affy_chicken","ensembl_gene_id"), mart=mart) # merge array data and ensembl id myd <- merge(tbl, affy2ens, by.x="ID_REF", by.y="affy_chicken", sort=FALSE) # average over ensembl gene ids myda <- aggregate(myd[,3:8], by=list(ID=myd$ensembl_gene_id), mean) ############################################################### # # # Section 2: # # # # In this section we download some files from the KEGG # # FTP site and link them together using R's merge function. # # This example shows how we link ensembl IDs to KEGG gene IDs # # and KEGG pathways, but there are other files at # # ftp://ftp.genome.jp/pub/kegg/genes/organisms/ that can link # # other identifiers. # # # # NOTE the use of gga in the urls - this stands for # # gallus gallus as we are looking at chicken data. # # Humans will be hsa and Mouse mmu etc # # # ############################################################### # right, now to get some data from KEGG # easier to get it direct from the ftp site! t1 <- tempfile() t2 <- tempfile() t3 <- tempfile() download.file(url="ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab", destfile=t1) download.file(url="ftp://ftp.genome.jp/pub/kegg/genes/organisms/gga/gga_ensembl.list", destfile=t2) download.file(url="ftp://ftp.genome.jp/pub/kegg/genes/organisms/gga/gga_pathway.list", destfile=t3) # we want to link ensembl to KEGG gene ensembl2kegene <- read.table(t2, header=FALSE, sep="\t") # set the column names colnames(ensembl2kegene) <- c("KEGG_gene_id", "ENSEMBL_gene_id") # replace the annoying "gga:" and "ensembl:" bits ensembl2kegene$KEGG_gene_id <- sub("gga:","", ensembl2kegene$KEGG_gene_id) # sub stands for subsitute! ensembl2kegene$ENSEMBL_gene_id <- sub("ensembl:","", ensembl2kegene$ENSEMBL_gene_id) # now we want to link KEGG gene to KEGG pathway kegene2pathway <- read.table(t3, header=FALSE, sep="\t") # set the column names colnames(kegene2pathway) <- c("KEGG_gene_id", "KEGG_pathway_id") # replace the annoying "gga:" and "path:" bits kegene2pathway$KEGG_gene_id <- sub("gga:","", kegene2pathway$KEGG_gene_id) kegene2pathway$KEGG_pathway_id <- sub("path:","", kegene2pathway$KEGG_pathway_id) # we now want to make a generic id that gives us the pathway id without the gga kegene2pathway$generic_id <- sub("gga","", kegene2pathway$KEGG_pathway_id) # finally, lets import the pathway titles pathway <- read.table(t1, header=FALSE, sep="\t", quote=NULL, colClasses="character") # it's important we set the columns to character # set the column names colnames(pathway) <- c("generic_id","pathway_title") # now lets start linking those together array2kegene <- merge(myda, ensembl2kegene, by.x="ID", by.y="ENSEMBL_gene_id", sort=FALSE) # and now to pathway id array2pathwayid <- merge(array2kegene, kegene2pathway, by.x="KEGG_gene_id", by.y="KEGG_gene_id", sort=FALSE) # and now to the pathway title! array2pathway <- merge(array2pathwayid, pathway, by.x="generic_id", by.y="generic_id", sort=FALSE) ############################################################### # # # Section 3: # # # # Now we draw the pathways! We concentrate on only one # # pathway and on only one of the arrays, but this can be # # repeated for other pathways and other arrays. You may want # # to play about with the bins and the colours to get what you # # want! # # # ############################################################### # now for the KEGGSOAP stuff! library(KEGGSOAP) # limit the data to just gga04010 my.path <- array2pathway[array2pathway$KEGG_pathway_id=="gga04010",] # lets query KEGGSOAP and give it the name of our pathway # and the genes we are interested in url <- mark.pathway.by.objects("path:gga04010", my.path$KEGG_gene_id) browseURL(url) # create ten bins to put the data into # this command looks at our numerical data and decides on 10 # evenly spaced "bins" to put our data in bins <- cut(my.path$GSM158353, 10, labels=FALSE) # create ten colours between green and red library(geneplotter) cols <- dChip.colors(9) # convert the bins into the colours my.cols <- cols[bins] # this is for convenience - we will use white as the foreground fglist <- rep("white", length(my.cols)) # visualise the array data url <- color.pathway.by.objects("path:gga04010", my.path$KEGG_gene_id, fglist, my.cols) browseURL(url)